diff --git a/.nojekyll b/.nojekyll
new file mode 100644
index 00000000..e69de29b
diff --git a/cache.json b/cache.json
new file mode 100644
index 00000000..14eef1da
--- /dev/null
+++ b/cache.json
@@ -0,0 +1 @@
+{"2024-09-20T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.12194v3","updated":"2024-09-20T04:38:21Z","published":"2024-09-18T17:59:52Z","title":"Gender Representation and Bias in Indian Civil Service Mock Interviews","summary":" This paper makes three key contributions. First, via a substantial corpus of\n51,278 interview questions sourced from 888 YouTube videos of mock interviews\nof Indian civil service candidates, we demonstrate stark gender bias in the\nbroad nature of questions asked to male and female candidates. Second, our\nexperiments with large language models show a strong presence of gender bias in\nexplanations provided by the LLMs on the gender inference task. Finally, we\npresent a novel dataset of 51,278 interview questions that can inform future\nsocial science studies.\n","authors":["Somonnoy Banerjee","Sujan Dutta","Soumyajit Datta","Ashiqur R. KhudaBukhsh"],"pdf_url":"https://arxiv.org/pdf/2409.12194v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10997v3","updated":"2024-09-20T13:34:44Z","published":"2024-09-17T09:00:11Z","title":"Contextual Breach: Assessing the Robustness of Transformer-based QA\n Models","summary":" Contextual question-answering models are susceptible to adversarial\nperturbations to input context, commonly observed in real-world scenarios.\nThese adversarial noises are designed to degrade the performance of the model\nby distorting the textual input. We introduce a unique dataset that\nincorporates seven distinct types of adversarial noise into the context, each\napplied at five different intensity levels on the SQuAD dataset. To quantify\nthe robustness, we utilize robustness metrics providing a standardized measure\nfor assessing model performance across varying noise types and levels.\nExperiments on transformer-based question-answering models reveal robustness\nvulnerabilities and important insights into the model's performance in\nrealistic textual input.\n","authors":["Asir Saadat","Nahian Ibn Asad","Md Farhan Ishmam"],"pdf_url":"https://arxiv.org/pdf/2409.10997v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.12537v5","updated":"2024-09-20T08:49:37Z","published":"2023-10-19T07:39:00Z","title":"ExtractGPT: Exploring the Potential of Large Language Models for Product\n Attribute Value Extraction","summary":" E-commerce platforms require structured product data in the form of\nattribute-value pairs to offer features such as faceted product search or\nattribute-based product comparison. However, vendors often provide unstructured\nproduct descriptions, necessitating the extraction of attribute-value pairs\nfrom these texts. BERT-based extraction methods require large amounts of\ntask-specific training data and struggle with unseen attribute values. This\npaper explores using large language models (LLMs) as a more training-data\nefficient and robust alternative. We propose prompt templates for zero-shot and\nfew-shot scenarios, comparing textual and JSON-based target schema\nrepresentations. Our experiments show that GPT-4 achieves the highest average\nF1-score of 85% using detailed attribute descriptions and demonstrations.\nLlama-3-70B performs nearly as well, offering a competitive open-source\nalternative. 
GPT-4 surpasses the best PLM baseline by 5% in F1-score.\nFine-tuning GPT-3.5 increases the performance to the level of GPT-4 but reduces\nthe model's ability to generalize to unseen attribute values.\n","authors":["Alexander Brinkmann","Roee Shraga","Christian Bizer"],"pdf_url":"https://arxiv.org/pdf/2310.12537v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09239v3","updated":"2024-09-20T21:12:25Z","published":"2024-09-14T00:30:57Z","title":"Autoregressive + Chain of Thought = Recurrent: Recurrence's Role in\n Language Models' Computability and a Revisit of Recurrent Transformer","summary":" The Transformer architecture excels in a variety of language modeling tasks,\noutperforming traditional neural architectures such as RNN and LSTM. This is\npartially due to its elimination of recurrent connections, which allows for\nparallel training and a smoother flow of gradients. However, this move away\nfrom recurrent structures places the Transformer model at the lower end of\nChomsky's computational hierarchy, imposing limitations on its computational\nabilities. Consequently, even advanced Transformer-based models face\nconsiderable difficulties in tasks like counting, string reversal, and\nmultiplication. These tasks, though seemingly elementary, require a level of\ncomputational complexity that exceeds the capabilities of the Transformer\narchitecture. Concurrently, the emergence of ``Chain of Thought\" (CoT)\nprompting has enabled Transformer-based language models to tackle tasks that\nwere previously impossible or poorly executed. In this work, we thoroughly\ninvestigate the influence of recurrent structures in neural models on their\nreasoning abilities and computability, contrasting the role autoregression\nplays in the neural models' computational power. We then shed light on how the\nCoT approach can mimic recurrent computation and act as a bridge between\nautoregression and recurrence in the context of language models. It is this\napproximated recurrence that notably improves the model's performance and\ncomputational capacity. Moreover, we revisit recent recurrent-based Transformer\nmodel designs, focusing on their computational abilities through our proposed\nconcept of ``recurrence-completeness\" and identify key theoretical limitations\nin models like Linear Transformer and RWKV. Through this, we aim to provide\ninsight into the neural model architectures and prompt better model design.\n","authors":["Xiang Zhang","Muhammad Abdul-Mageed","Laks V. S. Lakshmanan"],"pdf_url":"https://arxiv.org/pdf/2409.09239v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11783v2","updated":"2024-09-20T04:13:33Z","published":"2024-09-18T08:07:37Z","title":"Development and bilingual evaluation of Japanese medical large language\n model within reasonably low computational resources","summary":" The recent success of large language models (LLMs) and the scaling law has\nled to a widespread adoption of larger models. Particularly in the healthcare\nindustry, there is an increasing demand for locally operated LLMs due to\nsecurity concerns. However, the majority of high quality open-source LLMs have\na size of 70B parameters, imposing significant financial burdens on users for\nGPU preparation and operation. To overcome these issues, we present a medical\nadaptation based on the recent 7B models, which enables the operation in low\ncomputational resources. 
We compare the performance on medical\nquestion-answering benchmarks in two languages (Japanese and English),\ndemonstrating that its scores reach parity with or surpass those of currently\nexisting medical LLMs that are ten times larger. We find that fine-tuning an\nEnglish-centric base model on a Japanese medical dataset improves the score in\nboth languages, supporting the effect of cross-lingual knowledge transfer. We\nhope that this study will alleviate financial challenges, serving as a stepping\nstone for clinical institutions to practically utilize LLMs locally. Our\nevaluation code is available at\nhttps://github.com/stardust-coder/japanese-lm-med-harness.\n","authors":["Issey Sukeda"],"pdf_url":"https://arxiv.org/pdf/2409.11783v2.pdf","comment":"18 pages, 9 tables"},{"id":"http://arxiv.org/abs/2407.12363v3","updated":"2024-09-20T01:59:52Z","published":"2024-07-17T07:39:16Z","title":"Conversational Query Reformulation with the Guidance of Retrieved\n Documents","summary":" Conversational search seeks to retrieve relevant passages for the given\nquestions in conversational question answering. Conversational Query\nReformulation (CQR) improves conversational search by refining the original\nqueries into de-contextualized forms to resolve the issues in the original\nqueries, such as omissions and coreferences. Previous CQR methods focus on\nimitating human-written queries, which may not always yield meaningful search\nresults for the retriever. In this paper, we introduce GuideCQR, a framework\nthat refines queries for CQR by leveraging key information from the initially\nretrieved documents. Specifically, GuideCQR extracts keywords and generates\nexpected answers from the retrieved documents, then unifies them with the\nqueries after filtering to add useful information that enhances the search\nprocess. Experimental results demonstrate that our proposed method achieves\nstate-of-the-art performance across multiple datasets, outperforming previous\nCQR methods. Additionally, we show that GuideCQR can get additional performance\ngains in conversational search using various types of queries, even for queries\nwritten by humans.\n","authors":["Jeonghyun Park","Hwanhee Lee"],"pdf_url":"https://arxiv.org/pdf/2407.12363v3.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2409.11232v2","updated":"2024-09-20T07:53:16Z","published":"2024-09-17T14:29:03Z","title":"Fast Analysis of the OpenAI O1-Preview Model in Solving Random K-SAT\n Problem: Does the LLM Solve the Problem Itself or Call an External SAT\n Solver?","summary":" In this manuscript, I present an analysis on the performance of the OpenAI\nO1-preview model in solving random K-SAT instances for K$\in \{2,3,4\}$ as a\nfunction of $\alpha=M/N$ where $M$ is the number of clauses and $N$ is the\nnumber of variables of the satisfiable problem. I show that the model can call\nan external SAT solver to solve the instances, rather than solving them\ndirectly. Despite using external solvers, the model reports incorrect\nassignments as output. 
Moreover, I propose and present an analysis to quantify\nwhether the OpenAI O1-preview model demonstrates a spark of intelligence or\nmerely makes random guesses when outputting an assignment for a Boolean\nsatisfiability problem.\n","authors":["Raffaele Marino"],"pdf_url":"https://arxiv.org/pdf/2409.11232v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11114v2","updated":"2024-09-20T14:03:36Z","published":"2024-09-17T12:07:17Z","title":"Diversity-grounded Channel Prototypical Learning for Out-of-Distribution\n Intent Detection","summary":" In the realm of task-oriented dialogue systems, a robust intent detection\nmechanism must effectively handle malformed utterances encountered in\nreal-world scenarios. This study presents a novel fine-tuning framework for\nlarge language models (LLMs) aimed at enhancing in-distribution (ID) intent\nclassification and out-of-distribution (OOD) intent detection, which utilizes\nsemantic matching with prototypes derived from ID class names. By harnessing\nthe highly distinguishable representations of LLMs, we construct semantic\nprototypes for each ID class using a diversity-grounded prompt tuning approach.\nWe rigorously test our framework in a challenging OOD context, where ID and OOD\nclasses are semantically close yet distinct, referred to as \\emph{near} OOD\ndetection. For a thorough assessment, we benchmark our method against the\nprevalent fine-tuning approaches. The experimental findings reveal that our\nmethod demonstrates superior performance in both few-shot ID intent\nclassification and near-OOD intent detection tasks.\n","authors":["Bo Liu","Liming Zhan","Yujie Feng","Zexin Lu","Chengqiang Xie","Lei Xue","Albert Y. S. Lam","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2409.11114v2.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2409.11074v2","updated":"2024-09-20T15:47:51Z","published":"2024-09-17T11:03:46Z","title":"RoMath: A Mathematical Reasoning Benchmark in Romanian","summary":" Mathematics has long been conveyed through natural language, primarily for\nhuman understanding. With the rise of mechanized mathematics and proof\nassistants, there is a growing need to understand informal mathematical text,\nyet most existing benchmarks focus solely on English, overlooking other\nlanguages. This paper introduces RoMath, a Romanian mathematical reasoning\nbenchmark suite comprising three datasets: RoMath-Baccalaureate,\nRoMath-Competitions and RoMath-Synthetic, which cover a range of mathematical\ndomains and difficulty levels, aiming to improve non-English language models\nand promote multilingual AI development. By focusing on Romanian, a\nlow-resource language with unique linguistic features, RoMath addresses the\nlimitations of Anglo-centric models and emphasizes the need for dedicated\nresources beyond simple automatic translation. We benchmark several open-weight\nlanguage models, highlighting the importance of creating resources for\nunderrepresented languages. We make the code and dataset available.\n","authors":["Adrian Cosma","Ana-Maria Bucur","Emilian Radoi"],"pdf_url":"https://arxiv.org/pdf/2409.11074v2.pdf","comment":"4 Figures, 12 Tables"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.11301v2","updated":"2024-09-20T08:48:30Z","published":"2024-09-17T15:57:33Z","title":"TISIS : Trajectory Indexing for SImilarity Search","summary":" Social media platforms enable users to share diverse types of information,\nincluding geolocation data that captures their movement patterns. 
Such\ngeolocation data can be leveraged to reconstruct the trajectory of a user's\nvisited Points of Interest (POIs). A key requirement in numerous applications\nis the ability to measure the similarity between such trajectories, as this\nfacilitates the retrieval of trajectories that are similar to a given reference\ntrajectory. This is the main focus of our work. Existing methods predominantly\nrely on applying a similarity function to each candidate trajectory to identify\nthose that are sufficiently similar. However, this approach becomes\ncomputationally expensive when dealing with large-scale datasets. To mitigate\nthis challenge, we propose TISIS, an efficient method that uses trajectory\nindexing to quickly find similar trajectories that share common POIs in the\nsame order. Furthermore, to account for scenarios where POIs in trajectories\nmay not exactly match but are contextually similar, we introduce TISIS*, a\nvariant of TISIS that incorporates POI embeddings. This extension allows for\nmore comprehensive retrieval of similar trajectories by considering semantic\nsimilarities between POIs, beyond mere exact matches. Extensive experimental\nevaluations demonstrate that the proposed approach significantly outperforms a\nbaseline method based on the well-known Longest Common SubSequence (LCSS)\nalgorithm, yielding substantial performance improvements across various\nreal-world datasets.\n","authors":["Sara Jarrad","Hubert Naacke","Stephane Gancarski"],"pdf_url":"https://arxiv.org/pdf/2409.11301v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.08479v2","updated":"2024-09-20T04:52:16Z","published":"2024-09-13T02:08:47Z","title":"Exploring Information Retrieval Landscapes: An Investigation of a Novel\n Evaluation Techniques and Comparative Document Splitting Methods","summary":" The performance of Retrieval-Augmented Generation (RAG) systems in\ninformation retrieval is significantly influenced by the characteristics of the\ndocuments being processed. In this study, the structured nature of textbooks,\nthe conciseness of articles, and the narrative complexity of novels are shown\nto require distinct retrieval strategies. A comparative evaluation of multiple\ndocument-splitting methods reveals that the Recursive Character Splitter\noutperforms the Token-based Splitter in preserving contextual integrity. A\nnovel evaluation technique is introduced, utilizing an open-source model to\ngenerate a comprehensive dataset of question-and-answer pairs, simulating\nrealistic retrieval scenarios to enhance testing efficiency and metric\nreliability. The evaluation employs weighted scoring metrics, including\nSequenceMatcher, BLEU, METEOR, and BERT Score, to assess the system's accuracy\nand relevance. 
This approach establishes a refined standard for evaluating the\nprecision of RAG systems, with future research focusing on optimizing chunk and\noverlap sizes to improve retrieval accuracy and efficiency.\n","authors":["Esmaeil Narimissa","David Raithel"],"pdf_url":"https://arxiv.org/pdf/2409.08479v2.pdf","comment":"This article is 16 pages long and includes detailed comparisons of\n RAG systems and document splitting techniques"}]},"2024-09-18T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.12191v1","updated":"2024-09-18T17:59:32Z","published":"2024-09-18T17:59:32Z","title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at\n Any Resolution","summary":" We present the Qwen2-VL Series, an advanced upgrade of the previous Qwen-VL\nmodels that redefines the conventional predetermined-resolution approach in\nvisual processing. Qwen2-VL introduces the Naive Dynamic Resolution mechanism,\nwhich enables the model to dynamically process images of varying resolutions\ninto different numbers of visual tokens. This approach allows the model to\ngenerate more efficient and accurate visual representations, closely aligning\nwith human perceptual processes. The model also integrates Multimodal Rotary\nPosition Embedding (M-RoPE), facilitating the effective fusion of positional\ninformation across text, images, and videos. We employ a unified paradigm for\nprocessing both images and videos, enhancing the model's visual perception\ncapabilities. To explore the potential of large multimodal models, Qwen2-VL\ninvestigates the scaling laws for large vision-language models (LVLMs). By\nscaling both the model size-with versions at 2B, 8B, and 72B parameters-and the\namount of training data, the Qwen2-VL Series achieves highly competitive\nperformance. Notably, the Qwen2-VL-72B model achieves results comparable to\nleading models such as GPT-4o and Claude3.5-Sonnet across various multimodal\nbenchmarks, outperforming other generalist models. Code is available at\n\url{https://github.com/QwenLM/Qwen2-VL}.\n","authors":["Peng Wang","Shuai Bai","Sinan Tan","Shijie Wang","Zhihao Fan","Jinze Bai","Keqin Chen","Xuejing Liu","Jialin Wang","Wenbin Ge","Yang Fan","Kai Dang","Mengfei Du","Xuancheng Ren","Rui Men","Dayiheng Liu","Chang Zhou","Jingren Zhou","Junyang Lin"],"pdf_url":"https://arxiv.org/pdf/2409.12191v1.pdf","comment":"Code is available at https://github.com/QwenLM/Qwen2-VL"},{"id":"http://arxiv.org/abs/2409.12186v1","updated":"2024-09-18T17:57:57Z","published":"2024-09-18T17:57:57Z","title":"Qwen2.5-Coder Technical Report","summary":" In this report, we introduce the Qwen2.5-Coder series, a significant upgrade\nfrom its predecessor, CodeQwen1.5. This series includes two models:\nQwen2.5-Coder-1.5B and Qwen2.5-Coder-7B. As a code-specific model,\nQwen2.5-Coder is built upon the Qwen2.5 architecture and continues pretraining\non a vast corpus of over 5.5 trillion tokens. Through meticulous data cleaning,\nscalable synthetic data generation, and balanced data mixing, Qwen2.5-Coder\ndemonstrates impressive code generation capabilities while retaining general\nversatility. The model has been evaluated on a wide range of code-related\ntasks, achieving state-of-the-art (SOTA) performance across more than 10\nbenchmarks, including code generation, completion, reasoning, and repair,\nconsistently outperforming larger models of the same model size. 
We believe\nthat the release of the Qwen2.5-Coder series will not only push the boundaries\nof research in code intelligence but also, through its permissive licensing,\nencourage broader adoption by developers in real-world applications.\n","authors":["Binyuan Hui","Jian Yang","Zeyu Cui","Jiaxi Yang","Dayiheng Liu","Lei Zhang","Tianyu Liu","Jiajun Zhang","Bowen Yu","Kai Dang","An Yang","Rui Men","Fei Huang","Xingzhang Ren","Xuancheng Ren","Jingren Zhou","Junyang Lin"],"pdf_url":"https://arxiv.org/pdf/2409.12186v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12183v1","updated":"2024-09-18T17:55:00Z","published":"2024-09-18T17:55:00Z","title":"To CoT or not to CoT? Chain-of-thought helps mainly on math and symbolic\n reasoning","summary":" Chain-of-thought (CoT) via prompting is the de facto method for eliciting\nreasoning capabilities from large language models (LLMs). But for what kinds of\ntasks is this extra ``thinking'' really helpful? To analyze this, we conducted\na quantitative meta-analysis covering over 100 papers using CoT and ran our own\nevaluations of 20 datasets across 14 models. Our results show that CoT gives\nstrong performance benefits primarily on tasks involving math or logic, with\nmuch smaller gains on other types of tasks. On MMLU, directly generating the\nanswer without CoT leads to almost identical accuracy as CoT unless the\nquestion or model's response contains an equals sign, indicating symbolic\noperations and reasoning. Following this finding, we analyze the behavior of\nCoT on these problems by separating planning and execution and comparing\nagainst tool-augmented LLMs. Much of CoT's gain comes from improving symbolic\nexecution, but it underperforms relative to using a symbolic solver. Our\nresults indicate that CoT can be applied selectively, maintaining performance\nwhile saving inference costs. Furthermore, they suggest a need to move beyond\nprompt-based CoT to new paradigms that better leverage intermediate computation\nacross the whole range of LLM applications.\n","authors":["Zayne Sprague","Fangcong Yin","Juan Diego Rodriguez","Dongwei Jiang","Manya Wadhwa","Prasann Singhal","Xinyu Zhao","Xi Ye","Kyle Mahowald","Greg Durrett"],"pdf_url":"https://arxiv.org/pdf/2409.12183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12180v1","updated":"2024-09-18T17:52:53Z","published":"2024-09-18T17:52:53Z","title":"Finetuning Language Models to Emit Linguistic Expressions of Uncertainty","summary":" Large language models (LLMs) are increasingly employed in information-seeking\nand decision-making tasks. Despite their broad utility, LLMs tend to generate\ninformation that conflicts with real-world facts, and their persuasive style\ncan make these inaccuracies appear confident and convincing. As a result,\nend-users struggle to consistently align the confidence expressed by LLMs with\nthe accuracy of their predictions, often leading to either blind trust in all\noutputs or a complete disregard for their reliability. In this work, we explore\nsupervised finetuning on uncertainty-augmented predictions as a method to\ndevelop models that produce linguistic expressions of uncertainty.\nSpecifically, we measure the calibration of pre-trained models and then\nfine-tune language models to generate calibrated linguistic expressions of\nuncertainty. 
Through experiments on various question-answering datasets, we\ndemonstrate that LLMs are well-calibrated in assessing their predictions, and\nsupervised finetuning based on the model's own confidence leads to\nwell-calibrated expressions of uncertainty, particularly for single-claim\nanswers.\n","authors":["Arslan Chaudhry","Sridhar Thiagarajan","Dilan Gorur"],"pdf_url":"https://arxiv.org/pdf/2409.12180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09249v2","updated":"2024-09-18T17:44:08Z","published":"2024-09-14T01:21:56Z","title":"NovAScore: A New Automated Metric for Evaluating Document Level Novelty","summary":" The rapid expansion of online content has intensified the issue of\ninformation redundancy, underscoring the need for solutions that can identify\ngenuinely new information. Despite this challenge, the research community has\nseen a decline in focus on novelty detection, particularly with the rise of\nlarge language models (LLMs). Additionally, previous approaches have relied\nheavily on human annotation, which is time-consuming, costly, and particularly\nchallenging when annotators must compare a target document against a vast\nnumber of historical documents. In this work, we introduce NovAScore (Novelty\nEvaluation in Atomicity Score), an automated metric for evaluating\ndocument-level novelty. NovAScore aggregates the novelty and salience scores of\natomic information, providing high interpretability and a detailed analysis of\na document's novelty. With its dynamic weight adjustment scheme, NovAScore\noffers enhanced flexibility and an additional dimension to assess both the\nnovelty level and the importance of information within a document. Our\nexperiments show that NovAScore strongly correlates with human judgments of\nnovelty, achieving a 0.626 Point-Biserial correlation on the TAP-DLND 1.0\ndataset and a 0.920 Pearson correlation on an internal human-annotated dataset.\n","authors":["Lin Ai","Ziwei Gong","Harshsaiprasad Deshpande","Alexander Johnson","Emmy Phung","Ahmad Emami","Julia Hirschberg"],"pdf_url":"https://arxiv.org/pdf/2409.09249v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12172v1","updated":"2024-09-18T17:38:25Z","published":"2024-09-18T17:38:25Z","title":"You Only Read Once (YORO): Learning to Internalize Database Knowledge\n for Text-to-SQL","summary":" While significant progress has been made on the text-to-SQL task, recent\nsolutions repeatedly encode the same database schema for every question,\nresulting in unnecessary high inference cost and often overlooking crucial\ndatabase knowledge. To address these issues, we propose You Only Read Once\n(YORO), a novel paradigm that directly internalizes database knowledge into the\nparametric knowledge of a text-to-SQL model during training and eliminates the\nneed for schema encoding during inference. YORO significantly reduces the input\ntoken length by 66%-98%. 
Despite its shorter inputs, our empirical results\ndemonstrate YORO's competitive performance with traditional systems on three\nbenchmarks as well as its significant outperformance on large databases.\nFurthermore, YORO excels in handling questions with challenging value\nretrievals such as abbreviations.\n","authors":["Hideo Kobayashi","Wuwei Lan","Peng Shi","Shuaichen Chang","Jiang Guo","Henghui Zhu","Zhiguo Wang","Patrick Ng"],"pdf_url":"https://arxiv.org/pdf/2409.12172v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10289v2","updated":"2024-09-18T17:30:50Z","published":"2024-09-16T13:56:17Z","title":"ReflectDiffu:Reflect between Emotion-intent Contagion and Mimicry for\n Empathetic Response Generation via a RL-Diffusion Framework","summary":" Empathetic response generation necessitates the integration of emotional and\nintentional dynamics to foster meaningful interactions. Existing research\neither neglects the intricate interplay between emotion and intent, leading to\nsuboptimal controllability of empathy, or resorts to large language models\n(LLMs), which incur significant computational overhead. In this paper, we\nintroduce ReflectDiffu, a lightweight and comprehensive framework for\nempathetic response generation. This framework incorporates emotion contagion\nto augment emotional expressiveness and employs an emotion-reasoning mask to\npinpoint critical emotional elements. Additionally, it integrates intent\nmimicry within reinforcement learning for refinement during diffusion. By\nharnessing an intent twice-reflect mechanism of\nExploring-Sampling-Correcting, ReflectDiffu adeptly translates emotional\ndecision-making into precise intent actions, thereby addressing empathetic\nresponse misalignments stemming from emotional misrecognition. Through\nreflection, the framework maps emotional states to intents, markedly enhancing\nboth response empathy and flexibility. Comprehensive experiments reveal that\nReflectDiffu outperforms existing models regarding relevance, controllability,\nand informativeness, achieving state-of-the-art results in both automatic and\nhuman evaluations.\n","authors":["Jiahao Yuan","Zixiang Di","Zhiqing Cui","Guisong Yang","Usman Naseem"],"pdf_url":"https://arxiv.org/pdf/2409.10289v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12147v1","updated":"2024-09-18T17:12:41Z","published":"2024-09-18T17:12:41Z","title":"MAgICoRe: Multi-Agent, Iterative, Coarse-to-Fine Refinement for\n Reasoning","summary":" Large Language Models' (LLM) reasoning can be improved using test-time\naggregation strategies, i.e., generating multiple samples and voting among\ngenerated samples. While these improve performance, they often reach a\nsaturation point. Refinement offers an alternative by using LLM-generated\nfeedback to improve solution quality. However, refinement introduces 3 key\nchallenges: (1) Excessive refinement: Uniformly refining all instances can\nover-correct and reduce the overall performance. (2) Inability to localize and\naddress errors: LLMs have a limited ability to self-correct and struggle to\nidentify and correct their own mistakes. (3) Insufficient refinement: Deciding\nhow many iterations of refinement are needed is non-trivial, and stopping too\nsoon could leave errors unaddressed. 
To tackle these issues, we propose\nMAgICoRe, which avoids excessive refinement by categorizing problem difficulty\nas easy or hard, solving easy problems with coarse-grained aggregation and hard\nones with fine-grained and iterative multi-agent refinement. To improve error\nlocalization, we incorporate external step-wise reward model (RM) scores.\nMoreover, to ensure effective refinement, we employ a multi-agent loop with\nthree agents: Solver, Reviewer (which generates targeted feedback based on\nstep-wise RM scores), and the Refiner (which incorporates feedback). To ensure\nsufficient refinement, we re-evaluate updated solutions, iteratively initiating\nfurther rounds of refinement. We evaluate MAgICoRe on Llama-3-8B and GPT-3.5\nand show its effectiveness across 5 math datasets. Even one iteration of\nMAgICoRe beats Self-Consistency by 3.4%, Best-of-k by 3.2%, and Self-Refine by\n4.0% while using less than half the samples. Unlike iterative refinement with\nbaselines, MAgICoRe continues to improve with more iterations. Finally, our\nablations highlight the importance of MAgICoRe's RMs and multi-agent\ncommunication.\n","authors":["Justin Chih-Yao Chen","Archiki Prasad","Swarnadeep Saha","Elias Stengel-Eskin","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2409.12147v1.pdf","comment":"22 pages, code: https://github.com/dinobby/MAgICoRe"},{"id":"http://arxiv.org/abs/2409.12136v1","updated":"2024-09-18T17:00:20Z","published":"2024-09-18T17:00:20Z","title":"GRIN: GRadient-INformed MoE","summary":" Mixture-of-Experts (MoE) models scale more effectively than dense models due\nto sparse computation through expert routing, selectively activating only a\nsmall subset of expert modules. However, sparse computation challenges\ntraditional training practices, as discrete expert routing hinders standard\nbackpropagation and thus gradient-based optimization, which are the cornerstone\nof deep learning. To better pursue the scaling power of MoE, we introduce GRIN\n(GRadient-INformed MoE training), which incorporates sparse gradient estimation\nfor expert routing and configures model parallelism to avoid token dropping.\nApplying GRIN to autoregressive language modeling, we develop a top-2\n16$\\times$3.8B MoE model. Our model, with only 6.6B activated parameters,\noutperforms a 7B dense model and matches the performance of a 14B dense model\ntrained on the same data. Extensive evaluations across diverse tasks\ndemonstrate the potential of GRIN to significantly enhance MoE efficacy,\nachieving 79.4 on MMLU, 83.7 on HellaSwag, 74.4 on HumanEval, and 58.9 on MATH.\n","authors":["Liyuan Liu","Young Jin Kim","Shuohang Wang","Chen Liang","Yelong Shen","Hao Cheng","Xiaodong Liu","Masahiro Tanaka","Xiaoxia Wu","Wenxiang Hu","Vishrav Chaudhary","Zeqi Lin","Chenruidong Zhang","Jilong Xue","Hany Awadalla","Jianfeng Gao","Weizhu Chen"],"pdf_url":"https://arxiv.org/pdf/2409.12136v1.pdf","comment":"58 pages"},{"id":"http://arxiv.org/abs/2409.12134v1","updated":"2024-09-18T16:56:06Z","published":"2024-09-18T16:56:06Z","title":"BERT-VBD: Vietnamese Multi-Document Summarization Framework","summary":" In tackling the challenge of Multi-Document Summarization (MDS), numerous\nmethods have been proposed, spanning both extractive and abstractive\nsummarization techniques. However, each approach has its own limitations,\nmaking it less effective to rely solely on either one. An emerging and\npromising strategy involves a synergistic fusion of extractive and abstractive\nsummarization methods. 
Despite the plethora of studies in this domain, research\non the combined methodology remains scarce, particularly in the context of\nVietnamese language processing. This paper presents a novel Vietnamese MDS\nframework leveraging a two-component pipeline architecture that integrates\nextractive and abstractive techniques. The first component employs an\nextractive approach to identify key sentences within each document. This is\nachieved by a modification of the pre-trained BERT network, which derives\nsemantically meaningful phrase embeddings using siamese and triplet network\nstructures. The second component utilizes the VBD-LLaMA2-7B-50b model for\nabstractive summarization, ultimately generating the final summary document.\nOur proposed framework demonstrates a positive performance, attaining ROUGE-2\nscores of 39.6% on the VN-MDS dataset and outperforming the state-of-the-art\nbaselines.\n","authors":["Tuan-Cuong Vuong","Trang Mai Xuan","Thien Van Luong"],"pdf_url":"https://arxiv.org/pdf/2409.12134v1.pdf","comment":"10 pages"},{"id":"http://arxiv.org/abs/2409.12126v1","updated":"2024-09-18T16:51:02Z","published":"2024-09-18T16:51:02Z","title":"Linguini: A benchmark for language-agnostic linguistic reasoning","summary":" We propose a new benchmark to measure a language model's linguistic reasoning\nskills without relying on pre-existing language-specific knowledge. The test\ncovers 894 questions grouped in 160 problems across 75 (mostly) extremely\nlow-resource languages, extracted from the International Linguistic Olympiad\ncorpus. To attain high accuracy on this benchmark, models don't need previous\nknowledge of the tested language, as all the information needed to solve the\nlinguistic puzzle is presented in the context. We find that, while all analyzed\nmodels rank below 25% accuracy, there is a significant gap between open and\nclosed models, with the best-performing proprietary model at 24.05% and the\nbest-performing open model at 8.84%.\n","authors":["Eduardo Sánchez","Belen Alastruey","Christophe Ropers","Pontus Stenetorp","Mikel Artetxe","Marta R. Costa-jussà"],"pdf_url":"https://arxiv.org/pdf/2409.12126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12122v1","updated":"2024-09-18T16:45:37Z","published":"2024-09-18T16:45:37Z","title":"Qwen2.5-Math Technical Report: Toward Mathematical Expert Model via\n Self-Improvement","summary":" In this report, we present a series of math-specific large language models:\nQwen2.5-Math and Qwen2.5-Math-Instruct-1.5B/7B/72B. The core innovation of the\nQwen2.5 series lies in integrating the philosophy of self-improvement\nthroughout the entire pipeline, from pre-training and post-training to\ninference: (1) During the pre-training phase, Qwen2-Math-Instruct is utilized\nto generate large-scale, high-quality mathematical data. (2) In the\npost-training phase, we develop a reward model (RM) by conducting massive\nsampling from Qwen2-Math-Instruct. This RM is then applied to the iterative\nevolution of data in supervised fine-tuning (SFT). With a stronger SFT model,\nit's possible to iteratively train and update the RM, which in turn guides the\nnext round of SFT data iteration. 
On the final SFT model, we employ the\nultimate RM for reinforcement learning, resulting in the Qwen2.5-Math-Instruct.\n(3) Furthermore, during the inference stage, the RM is used to guide sampling,\noptimizing the model's performance.\n Qwen2.5-Math-Instruct supports both Chinese and English, and possesses advanced\nmathematical reasoning capabilities, including Chain-of-Thought (CoT) and\nTool-Integrated Reasoning (TIR). We evaluate our models on 10 mathematics\ndatasets in both English and Chinese, such as GSM8K, MATH, GaoKao, AMC23, and\nAIME24, covering a range of difficulties from grade school level to math\ncompetition problems.\n","authors":["An Yang","Beichen Zhang","Binyuan Hui","Bofei Gao","Bowen Yu","Chengpeng Li","Dayiheng Liu","Jianhong Tu","Jingren Zhou","Junyang Lin","Keming Lu","Mingfeng Xue","Runji Lin","Tianyu Liu","Xingzhang Ren","Zhenru Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.12122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12117v1","updated":"2024-09-18T16:39:10Z","published":"2024-09-18T16:39:10Z","title":"Low Frame-rate Speech Codec: a Codec Designed for Fast High-quality\n Speech LLM Training and Inference","summary":" Large language models (LLMs) have significantly advanced audio processing\nthrough audio codecs that convert audio into discrete tokens, enabling the\napplication of language modeling techniques to audio data. However, audio\ncodecs often operate at high frame rates, resulting in slow training and\ninference, especially for autoregressive models. To address this challenge, we\npresent the Low Frame-rate Speech Codec (LFSC): a neural audio codec that\nleverages finite scalar quantization and adversarial training with large speech\nlanguage models to achieve high-quality audio compression with a 1.89 kbps\nbitrate and 21.5 frames per second. We demonstrate that our novel codec can\nmake the inference of LLM-based text-to-speech models around three times faster\nwhile improving intelligibility and producing quality comparable to previous\nmodels.\n","authors":["Edresson Casanova","Ryan Langman","Paarth Neekhara","Shehzeen Hussain","Jason Li","Subhankar Ghosh","Ante Jukić","Sang-gil Lee"],"pdf_url":"https://arxiv.org/pdf/2409.12117v1.pdf","comment":"Submitted to ICASSP 2025"},{"id":"http://arxiv.org/abs/2409.12106v1","updated":"2024-09-18T16:26:22Z","published":"2024-09-18T16:26:22Z","title":"Measuring Human and AI Values based on Generative Psychometrics with\n Large Language Models","summary":" Human values and their measurement are a long-standing interdisciplinary\ninquiry. Recent advances in AI have sparked renewed interest in this area, with\nlarge language models (LLMs) emerging as both tools and subjects of value\nmeasurement. This work introduces Generative Psychometrics for Values (GPV), an\nLLM-based, data-driven value measurement paradigm, theoretically grounded in\ntext-revealed selective perceptions. We begin by fine-tuning an LLM for\naccurate perception-level value measurement and verifying the capability of\nLLMs to parse texts into perceptions, forming the core of the GPV pipeline.\nApplying GPV to human-authored blogs, we demonstrate its stability, validity,\nand superiority over prior psychological tools. 
Then, extending GPV to LLM\nvalue measurement, we advance the current art with 1) a psychometric\nmethodology that measures LLM values based on their scalable and free-form\noutputs, enabling context-specific measurement; 2) a comparative analysis of\nmeasurement paradigms, indicating response biases of prior methods; and 3) an\nattempt to bridge LLM values and their safety, revealing the predictive power\nof different value systems and the impacts of various values on LLM safety.\nThrough interdisciplinary efforts, we aim to leverage AI for next-generation\npsychometrics and psychometrics for value-aligned AI.\n","authors":["Haoran Ye","Yuhang Xie","Yuanyi Ren","Hanjun Fang","Xin Zhang","Guojie Song"],"pdf_url":"https://arxiv.org/pdf/2409.12106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10649v2","updated":"2024-09-18T15:48:09Z","published":"2024-09-16T18:29:19Z","title":"Visualizing Temporal Topic Embeddings with a Compass","summary":" Dynamic topic modeling is useful at discovering the development and change in\nlatent topics over time. However, present methodology relies on algorithms that\nseparate document and word representations. This prevents the creation of a\nmeaningful embedding space where changes in word usage and documents can be\ndirectly analyzed in a temporal context. This paper proposes an expansion of\nthe compass-aligned temporal Word2Vec methodology into dynamic topic modeling.\nSuch a method allows for the direct comparison of word and document embeddings\nacross time in dynamic topics. This enables the creation of visualizations that\nincorporate temporal word embeddings within the context of documents into topic\nvisualizations. In experiments against the current state-of-the-art, our\nproposed method demonstrates overall competitive performance in topic relevancy\nand diversity across temporal datasets of varying size. Simultaneously, it\nprovides insightful visualizations focused on temporal word embeddings while\nmaintaining the insights provided by global topic evolution, advancing our\nunderstanding of how topics evolve over time.\n","authors":["Daniel Palamarchuk","Lemara Williams","Brian Mayer","Thomas Danielson","Rebecca Faust","Larry Deschaine","Chris North"],"pdf_url":"https://arxiv.org/pdf/2409.10649v2.pdf","comment":"11 pages, 9 figures, conference paper"},{"id":"http://arxiv.org/abs/2409.12060v1","updated":"2024-09-18T15:33:48Z","published":"2024-09-18T15:33:48Z","title":"PARAPHRASUS : A Comprehensive Benchmark for Evaluating Paraphrase\n Detection Models","summary":" The task of determining whether two texts are paraphrases has long been a\nchallenge in NLP. However, the prevailing notion of paraphrase is often quite\nsimplistic, offering only a limited view of the vast spectrum of paraphrase\nphenomena. Indeed, we find that evaluating models in a paraphrase dataset can\nleave uncertainty about their true semantic understanding. To alleviate this,\nwe release paraphrasus, a benchmark designed for multi-dimensional assessment\nof paraphrase detection models and finer model selection. 
We find that\nparaphrase detection models under a fine-grained evaluation lens exhibit\ntrade-offs that cannot be captured through a single classification dataset.\n","authors":["Andrianos Michail","Simon Clematide","Juri Opitz"],"pdf_url":"https://arxiv.org/pdf/2409.12060v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12059v1","updated":"2024-09-18T15:32:48Z","published":"2024-09-18T15:32:48Z","title":"Dual-Layer Training and Decoding of Large Language Model with\n Simultaneously Thinking and Speaking","summary":" Large Language Models can reasonably understand and generate human expressions\nbut may lack thorough thinking and reasoning mechanisms. Recently there have\nbeen several studies which enhance the thinking ability of language models but\nmost of them are not data-driven or training-based. In this paper, we are\nmotivated by the cognitive mechanism in the natural world, and design a novel\nmodel architecture called TaS which allows it to first consider the thoughts\nand then express the response based upon the query. We design several pipelines\nto annotate or generate the thought contents from prompt-response samples, then\nadd language heads in a middle layer which behaves as the thinking layer. We\ntrain the language model with the thoughts-augmented data and successfully let\nthe thinking layer automatically generate reasonable thoughts and finally\noutput more reasonable responses. Both qualitative examples and quantitative\nresults validate the effectiveness and performance of TaS. Our code is\navailable at https://anonymous.4open.science/r/TadE.\n","authors":["Ningyuan Xi","Xiaoyu Wang","Yetao Wu","Teng Chen","Qingqing Gu","Jinxian Qu","Zhonglin Jiang","Yong Chen","Luo Ji"],"pdf_url":"https://arxiv.org/pdf/2409.12059v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.19472v3","updated":"2024-09-18T15:30:33Z","published":"2023-05-31T00:55:40Z","title":"PlaSma: Making Small Language Models Better Procedural Knowledge Models\n for (Counterfactual) Planning","summary":" Procedural planning, which entails decomposing a high-level goal into a\nsequence of temporally ordered steps, is an important yet intricate task for\nmachines. It involves integrating common-sense knowledge to reason about\ncomplex and often contextualized situations, e.g. ``scheduling a doctor's\nappointment without a phone''. While current approaches show encouraging\nresults using large language models (LLMs), they are hindered by drawbacks such\nas costly API calls and reproducibility issues. In this paper, we advocate\nplanning using smaller language models. We present PlaSma, a novel two-pronged\napproach to endow small language models with procedural knowledge and\n(constrained) language planning capabilities. More concretely, we develop\nsymbolic procedural knowledge distillation to enhance the commonsense knowledge\nin small language models and an inference-time algorithm to facilitate more\nstructured and accurate reasoning. In addition, we introduce a new related\ntask, Replanning, that requires a revision of a plan to cope with a constrained\nsituation. In both the planning and replanning settings, we show that\norders-of-magnitude smaller models (770M-11B parameters) can compete and often\nsurpass their larger teacher models' capabilities. Finally, we showcase\nsuccessful application of PlaSma in an embodied environment, VirtualHome.\n","authors":["Faeze Brahman","Chandra Bhagavatula","Valentina Pyatkin","Jena D. Hwang","Xiang Lorraine Li","Hirona J. 
Arai","Soumya Sanyal","Keisuke Sakaguchi","Xiang Ren","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2305.19472v3.pdf","comment":"ICLR 2024 version , 31 pages"},{"id":"http://arxiv.org/abs/2408.14496v3","updated":"2024-09-18T15:25:49Z","published":"2024-08-23T16:33:57Z","title":"A New Era in Computational Pathology: A Survey on Foundation and\n Vision-Language Models","summary":" Recent advances in deep learning have completely transformed the domain of\ncomputational pathology (CPath). More specifically, it has altered the\ndiagnostic workflow of pathologists by integrating foundation models (FMs) and\nvision-language models (VLMs) in their assessment and decision-making process.\nThe limitations of existing deep learning approaches in CPath can be overcome\nby FMs through learning a representation space that can be adapted to a wide\nvariety of downstream tasks without explicit supervision. Deploying VLMs allow\npathology reports written in natural language be used as rich semantic\ninformation sources to improve existing models as well as generate predictions\nin natural language form. In this survey, a holistic and systematic overview of\nrecent innovations in FMs and VLMs in CPath is presented. Furthermore, the\ntools, datasets and training schemes for these models are summarized in\naddition to categorizing them into distinct groups. This extensive survey\nhighlights the current trends in CPath and its possible revolution through the\nuse of FMs and VLMs in the future.\n","authors":["Dibaloke Chanda","Milan Aryal","Nasim Yahya Soltani","Masoud Ganji"],"pdf_url":"https://arxiv.org/pdf/2408.14496v3.pdf","comment":"20 pages, 19 figures and 9 tables"},{"id":"http://arxiv.org/abs/2409.12042v1","updated":"2024-09-18T15:03:04Z","published":"2024-09-18T15:03:04Z","title":"ASR Benchmarking: Need for a More Representative Conversational Dataset","summary":" Automatic Speech Recognition (ASR) systems have achieved remarkable\nperformance on widely used benchmarks such as LibriSpeech and Fleurs. However,\nthese benchmarks do not adequately reflect the complexities of real-world\nconversational environments, where speech is often unstructured and contains\ndisfluencies such as pauses, interruptions, and diverse accents. In this study,\nwe introduce a multilingual conversational dataset, derived from TalkBank,\nconsisting of unstructured phone conversation between adults. Our results show\na significant performance drop across various state-of-the-art ASR models when\ntested in conversational settings. Furthermore, we observe a correlation\nbetween Word Error Rate and the presence of speech disfluencies, highlighting\nthe critical need for more realistic, conversational ASR benchmarks.\n","authors":["Gaurav Maheshwari","Dmitry Ivanov","Théo Johannet","Kevin El Haddad"],"pdf_url":"https://arxiv.org/pdf/2409.12042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.09548v4","updated":"2024-09-18T14:26:21Z","published":"2023-05-16T15:45:59Z","title":"Measuring Dimensions of Self-Presentation in Twitter Bios and their\n Links to Misinformation Sharing","summary":" Social media platforms provide users with a profile description field,\ncommonly known as a ``bio,\" where they can present themselves to the world. A\ngrowing literature shows that text in these bios can improve our understanding\nof online self-presentation and behavior, but existing work relies exclusively\non keyword-based approaches to do so. 
We here propose and evaluate a suite of\nsimple, effective, and theoretically motivated approaches to embed bios in\nspaces that capture salient dimensions of social meaning, such as age and\npartisanship. We evaluate our methods on four tasks, showing that the\nstrongest one outperforms several practical baselines. We then show the\nutility of our method in helping understand associations between\nself-presentation and the sharing of URLs from low-quality news sites on\nTwitter, with a particular focus on exploring the interactions between age\nand partisanship, and the effects of self-presentations of religiosity. Our\nwork provides new tools to help computational social\nscientists make use of information in bios, and provides new insights into how\nmisinformation sharing may be perceived on Twitter.\n","authors":["Navid Madani","Rabiraj Bandyopadhyay","Briony Swire-Thompson","Michael Miller Yoder","Kenneth Joseph"],"pdf_url":"https://arxiv.org/pdf/2305.09548v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00297v4","updated":"2024-09-18T14:05:31Z","published":"2024-03-30T09:20:43Z","title":"A Hybrid Transformer and Attention Based Recurrent Neural Network for\n Robust and Interpretable Sentiment Analysis of Tweets","summary":" Sentiment analysis is crucial for understanding public opinion and consumer\nbehavior. Existing models face challenges with linguistic diversity,\ngeneralizability, and explainability. We propose TRABSA, a hybrid framework\nintegrating transformer-based architectures, attention mechanisms, and BiLSTM\nnetworks to address this. Leveraging RoBERTa trained on 124M tweets, we bridge\ngaps in sentiment analysis benchmarks, ensuring state-of-the-art accuracy.\nAugmenting datasets with tweets from 32 countries and US states, we compare six\nword-embedding techniques and three lexicon-based labeling techniques,\nselecting the best for optimal sentiment analysis. TRABSA outperforms\ntraditional ML and deep learning models with 94% accuracy and significant\nprecision, recall, and F1-score gains. Evaluation across diverse datasets\ndemonstrates consistent superiority and generalizability. SHAP and LIME\nanalyses enhance interpretability, improving confidence in predictions. Our\nstudy facilitates pandemic resource management, aiding resource planning,\npolicy formation, and vaccination tactics.\n","authors":["Md Abrar Jahin","Md Sakib Hossain Shovon","M. F. Mridha","Md Rashedul Islam","Yutaka Watanobe"],"pdf_url":"https://arxiv.org/pdf/2404.00297v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00099v3","updated":"2024-09-18T13:45:08Z","published":"2024-04-30T18:00:02Z","title":"Creative Beam Search: LLM-as-a-Judge For Improving Response Generation","summary":" Large language models are revolutionizing several areas, including artificial\ncreativity. However, the process of generation in machines profoundly diverges\nfrom that observed in humans. In particular, machine generation is\ncharacterized by a lack of intentionality and an underlying creative process.\nWe propose a method called Creative Beam Search that uses Diverse Beam Search\nand LLM-as-a-Judge to perform response generation and response validation. The\nresults of a qualitative experiment show how our approach can provide better\noutput than standard sampling techniques. 
We also show that the response\nvalidation step is a necessary complement to the response generation step.\n","authors":["Giorgio Franceschelli","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2405.00099v3.pdf","comment":"Presented as a short paper at the 15th International Conference on\n Computational Creativity (ICCC'24)"},{"id":"http://arxiv.org/abs/2409.10576v2","updated":"2024-09-18T13:27:43Z","published":"2024-09-15T15:21:45Z","title":"Language Models and Retrieval Augmented Generation for Automated\n Structured Data Extraction from Diagnostic Reports","summary":" Purpose: To develop and evaluate an automated system for extracting\nstructured clinical information from unstructured radiology and pathology\nreports using open-weights large language models (LMs) and retrieval augmented\ngeneration (RAG), and to assess the effects of model configuration variables on\nextraction performance. Methods and Materials: The study utilized two datasets:\n7,294 radiology reports annotated for Brain Tumor Reporting and Data System\n(BT-RADS) scores and 2,154 pathology reports annotated for isocitrate\ndehydrogenase (IDH) mutation status. An automated pipeline was developed to\nbenchmark the performance of various LMs and RAG configurations. The impact of\nmodel size, quantization, prompting strategies, output formatting, and\ninference parameters was systematically evaluated. Results: The best performing\nmodels achieved over 98% accuracy in extracting BT-RADS scores from radiology\nreports and over 90% for IDH mutation status extraction from pathology reports.\nThe top model was a medically fine-tuned Llama3. Larger, newer, and domain\nfine-tuned models consistently outperformed older and smaller models. Model\nquantization had minimal impact on performance. Few-shot prompting\nsignificantly improved accuracy. RAG improved performance for complex pathology\nreports but not for shorter radiology reports. Conclusions: Open LMs\ndemonstrate significant potential for automated extraction of structured\nclinical data from unstructured clinical reports with local privacy-preserving\napplications. Careful model selection, prompt engineering, and semi-automated\noptimization using annotated data are critical for optimal performance. These\napproaches could be reliable enough for practical use in research workflows,\nhighlighting the potential for human-machine collaboration in healthcare data\nextraction.\n","authors":["Mohamed Sobhi Jabal","Pranav Warman","Jikai Zhang","Kartikeye Gupta","Ayush Jain","Maciej Mazurowski","Walter Wiggins","Kirti Magudia","Evan Calabrese"],"pdf_url":"https://arxiv.org/pdf/2409.10576v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.00008v4","updated":"2024-09-18T13:25:52Z","published":"2023-03-27T18:00:01Z","title":"On the Creativity of Large Language Models","summary":" Large Language Models (LLMs) are revolutionizing several areas of Artificial\nIntelligence. One of the most remarkable applications is creative writing,\ne.g., poetry or storytelling: the generated outputs are often of astonishing\nquality. However, a natural question arises: can LLMs be really considered\ncreative? In this article, we first analyze the development of LLMs under the\nlens of creativity theories, investigating the key open questions and\nchallenges. In particular, we focus our discussion on the dimensions of value,\nnovelty, and surprise as proposed by Margaret Boden in her work. Then, we\nconsider different classic perspectives, namely product, process, press, and\nperson. 
We discuss a set of ``easy'' and ``hard'' problems in machine\ncreativity, presenting them in relation to LLMs. Finally, we examine the\nsocietal impact of these technologies with a particular focus on the creative\nindustries, analyzing the opportunities offered, the challenges arising from\nthem, and the potential associated risks, from both legal and ethical points of\nview.\n","authors":["Giorgio Franceschelli","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2304.00008v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11971v1","updated":"2024-09-18T13:22:04Z","published":"2024-09-18T13:22:04Z","title":"Sampling Latent Material-Property Information From LLM-Derived Embedding\n Representations","summary":" Vector embeddings derived from large language models (LLMs) show promise in\ncapturing latent information from the literature. Interestingly, these can be\nintegrated into material embeddings, potentially useful for data-driven\npredictions of materials properties. We investigate the extent to which\nLLM-derived vectors capture the desired information and their potential to\nprovide insights into material properties without additional training. Our\nfindings indicate that, although LLMs can be used to generate representations\nreflecting certain property information, extracting the embeddings requires\nidentifying the optimal contextual clues and appropriate comparators. Despite\nthis restriction, it appears that LLMs still have the potential to be useful in\ngenerating meaningful materials-science representations.\n","authors":["Luke P. J. Gilligan","Matteo Cobelli","Hasan M. Sayeed","Taylor D. Sparks","Stefano Sanvito"],"pdf_url":"https://arxiv.org/pdf/2409.11971v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.11968v1","updated":"2024-09-18T13:20:23Z","published":"2024-09-18T13:20:23Z","title":"Efficacy of Synthetic Data as a Benchmark","summary":" Large language models (LLMs) have enabled a range of applications in\nzero-shot and few-shot learning settings, including the generation of synthetic\ndatasets for training and testing. However, to reliably use these synthetic\ndatasets, it is essential to understand how representative they are of\nreal-world data. We investigate this by assessing the effectiveness of\ngenerating synthetic data through LLM and using it as a benchmark for various\nNLP tasks. Our experiments across six datasets, and three different tasks, show\nthat while synthetic data can effectively capture performance of various\nmethods for simpler tasks, such as intent classification, it falls short for\nmore complex tasks like named entity recognition. Additionally, we propose a\nnew metric called the bias factor, which evaluates the biases introduced when\nthe same LLM is used to both generate benchmarking data and to perform the\ntasks. We find that smaller LLMs exhibit biases towards their own generated\ndata, whereas larger models do not. 
Overall, our findings suggest that the\neffectiveness of synthetic data as a benchmark varies depending on the task,\nand that practitioners should rely on data generated from multiple larger\nmodels whenever possible.\n","authors":["Gaurav Maheshwari","Dmitry Ivanov","Kevin El Haddad"],"pdf_url":"https://arxiv.org/pdf/2409.11968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10516v2","updated":"2024-09-18T13:11:13Z","published":"2024-09-16T17:59:52Z","title":"RetrievalAttention: Accelerating Long-Context LLM Inference via Vector\n Retrieval","summary":" Transformer-based Large Language Models (LLMs) have become increasingly\nimportant. However, due to the quadratic time complexity of attention\ncomputation, scaling LLMs to longer contexts incurs extremely slow inference\nlatency and high GPU memory consumption for caching key-value (KV) vectors.\nThis paper proposes RetrievalAttention, a training-free approach to both\naccelerate attention computation and reduce GPU memory consumption. By\nleveraging the dynamic sparsity of the attention mechanism, RetrievalAttention\nproposes to use approximate nearest neighbor search (ANNS) indexes for KV\nvectors in CPU memory and retrieves the most relevant ones with vector search\nduring generation. Unfortunately, we observe that off-the-shelf ANNS\nindexes are often ineffective for such retrieval tasks due to the\nout-of-distribution (OOD) gap between query vectors and key vectors in the\nattention mechanism. RetrievalAttention addresses the OOD challenge by designing an\nattention-aware vector search algorithm that can adapt to the distribution of\nquery vectors. Our evaluation shows that RetrievalAttention only needs to\naccess 1--3% of data while maintaining high model accuracy. This leads to a\nsignificant reduction in the inference cost of long-context LLMs with a much\nlower GPU memory footprint. In particular, RetrievalAttention only needs a\nsingle NVIDIA RTX4090 (24GB) for serving 128K tokens in LLMs with 8B\nparameters, which is capable of generating one token in 0.188 seconds.\n","authors":["Di Liu","Meng Chen","Baotong Lu","Huiqiang Jiang","Zhenhua Han","Qianxi Zhang","Qi Chen","Chengruidong Zhang","Bailu Ding","Kai Zhang","Chen Chen","Fan Yang","Yuqing Yang","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2409.10516v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2409.11917v1","updated":"2024-09-18T12:29:22Z","published":"2024-09-18T12:29:22Z","title":"LLMs in Education: Novel Perspectives, Challenges, and Opportunities","summary":" The role of large language models (LLMs) in education is an increasing area\nof interest today, considering the new opportunities they offer for teaching,\nlearning, and assessment. This cutting-edge tutorial provides an overview of\nthe educational applications of NLP and the impact that the recent advances in\nLLMs have had on this field. We will discuss the key challenges and\nopportunities presented by LLMs, grounding them in the context of four major\neducational applications: reading, writing, and speaking skills, and\nintelligent tutoring systems (ITS). This COLING 2025 tutorial is designed for\nresearchers and practitioners interested in the educational applications of NLP\nand the role LLMs have to play in this area. 
It is the first of its kind to\naddress this timely topic.\n","authors":["Bashar Alhafni","Sowmya Vajjala","Stefano Bannò","Kaushal Kumar Maurya","Ekaterina Kochmar"],"pdf_url":"https://arxiv.org/pdf/2409.11917v1.pdf","comment":"COLING 2025 Tutorial"},{"id":"http://arxiv.org/abs/2409.11901v1","updated":"2024-09-18T11:54:45Z","published":"2024-09-18T11:54:45Z","title":"LLMs + Persona-Plug = Personalized LLMs","summary":" Personalization plays a critical role in numerous language tasks and\napplications, since users with the same requirements may prefer diverse outputs\nbased on their individual interests. This has led to the development of various\npersonalized approaches aimed at adapting large language models (LLMs) to\ngenerate customized outputs aligned with user preferences. Some of them involve\nfine-tuning a unique personalized LLM for each user, which is too expensive for\nwidespread application. Alternative approaches introduce personalization\ninformation in a plug-and-play manner by retrieving the user's relevant\nhistorical texts as demonstrations. However, this retrieval-based strategy may\nbreak the continuity of the user history and fail to capture the user's overall\nstyles and patterns, hence leading to sub-optimal performance. To address these\nchallenges, we propose a novel personalized LLM model, \ours{}. It constructs a\nuser-specific embedding for each individual by modeling all her historical\ncontexts through a lightweight plug-in user embedder module. By attaching this\nembedding to the task input, LLMs can better understand and capture user habits\nand preferences, thereby producing more personalized outputs without tuning\ntheir own parameters. Extensive experiments on various tasks in the language\nmodel personalization (LaMP) benchmark demonstrate that the proposed model\nsignificantly outperforms existing personalized LLM approaches.\n","authors":["Jiongnan Liu","Yutao Zhu","Shuting Wang","Xiaochi Wei","Erxue Min","Yu Lu","Shuaiqiang Wang","Dawei Yin","Zhicheng Dou"],"pdf_url":"https://arxiv.org/pdf/2409.11901v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11887v1","updated":"2024-09-18T11:34:28Z","published":"2024-09-18T11:34:28Z","title":"DocMamba: Efficient Document Pre-training with State Space Model","summary":" In recent years, visually-rich document understanding has attracted\nincreasing attention. Transformer-based pre-trained models have become the\nmainstream approach, yielding significant performance gains in this field.\nHowever, the self-attention mechanism's quadratic computational complexity\nhinders their efficiency and ability to process long documents. In this paper,\nwe present DocMamba, a novel framework based on the state space model. It is\ndesigned to reduce computational complexity to linear while preserving global\nmodeling capabilities. To further enhance its effectiveness in document\nprocessing, we introduce the Segment-First Bidirectional Scan (SFBS) to capture\ncontiguous semantic information. Experimental results demonstrate that DocMamba\nachieves new state-of-the-art results on downstream datasets such as FUNSD,\nCORD, and SROIE, while significantly improving speed and reducing memory usage.\nNotably, experiments on HRDoc confirm DocMamba's potential for length\nextrapolation. 
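DocMamba's internals are not given in this summary; as a rough illustration of why state-space models scale linearly with document length, here is a generic diagonal SSM scan. This is not the paper's Segment-First Bidirectional Scan, and all shapes are toy values.

```python
import numpy as np

# Minimal diagonal state-space recurrence: h_t = A*h_{t-1} + B@x_t, y_t = C@h_t.
# This is the generic linear-time scan that SSM-based models build on, not
# DocMamba's actual SFBS mechanism; all shapes are toy values.
def ssm_scan(x: np.ndarray, A: np.ndarray, B: np.ndarray, C: np.ndarray) -> np.ndarray:
    seq_len, _ = x.shape
    h = np.zeros(A.shape[0])
    ys = []
    for t in range(seq_len):          # O(seq_len): linear in document length
        h = A * h + B @ x[t]          # elementwise A keeps the update diagonal
        ys.append(C @ h)
    return np.stack(ys)

rng = np.random.default_rng(0)
x = rng.normal(size=(16, 8))                      # 16 tokens, 8 features
A = np.clip(rng.uniform(size=32), 0.0, 0.99)      # stable diagonal transition
B = rng.normal(size=(32, 8))
C = rng.normal(size=(4, 32))
print(ssm_scan(x, A, B, C).shape)                 # (16, 4)
```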
The code will be available online.\n","authors":["Pengfei Hu","Zhenrong Zhang","Jiefeng Ma","Shuhang Liu","Jun Du","Jianshu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.11887v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11860v1","updated":"2024-09-18T10:30:50Z","published":"2024-09-18T10:30:50Z","title":"Retrieve, Annotate, Evaluate, Repeat: Leveraging Multimodal LLMs for\n Large-Scale Product Retrieval Evaluation","summary":" Evaluating production-level retrieval systems at scale is a crucial yet\nchallenging task due to the limited availability of a large pool of\nwell-trained human annotators. Large Language Models (LLMs) have the potential\nto address this scaling issue and offer a viable alternative to humans for the\nbulk of annotation tasks. In this paper, we propose a framework for assessing\nthe product search engines in a large-scale e-commerce setting, leveraging\nMultimodal LLMs for (i) generating tailored annotation guidelines for\nindividual queries, and (ii) conducting the subsequent annotation task. Our\nmethod, validated through deployment on a large e-commerce platform,\ndemonstrates comparable quality to human annotations, significantly reduces\ntime and cost, facilitates rapid problem discovery, and provides an effective\nsolution for production-level quality control at scale.\n","authors":["Kasra Hosseini","Thomas Kober","Josip Krapac","Roland Vollgraf","Weiwei Cheng","Ana Peleteiro Ramallo"],"pdf_url":"https://arxiv.org/pdf/2409.11860v1.pdf","comment":"13 pages, 5 figures, 4 Tables"},{"id":"http://arxiv.org/abs/2405.13031v2","updated":"2024-09-18T10:06:58Z","published":"2024-05-16T10:45:43Z","title":"A Robust Autoencoder Ensemble-Based Approach for Anomaly Detection in\n Text","summary":" Anomaly detection (AD) is a fast-growing and popular domain among established\napplications like vision and time series. We observe a rich literature for\nthese applications, but anomaly detection in text is only starting to blossom.\nRecently, self-supervised methods with a self-attention mechanism have been the\nmost popular choice. While recent works have proposed a working ground for\nbuilding and benchmarking state of the art approaches, we make two principal\ncontributions in this paper: contextual anomaly contamination and a novel\nensemble-based approach. Our method, Textual Anomaly Contamination (TAC),\nallows inlier classes to be contaminated with either independent or contextual\nanomalies. In the literature, this distinction does not appear to be made.\nFor finding contextual anomalies, we propose RoSAE, a Robust\nSubspace Local Recovery Autoencoder Ensemble. All autoencoders of the ensemble\npresent a different latent representation through local manifold learning.\nBenchmarks show that our approach outperforms recent works on both independent\nand contextual anomalies, while being more robust. We also provide a comparison\nacross 8 datasets instead of relying only on the Reuters and 20 Newsgroups corpora.\n","authors":["Jeremie Pantin","Christophe Marsala"],"pdf_url":"https://arxiv.org/pdf/2405.13031v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11022v2","updated":"2024-09-18T10:05:02Z","published":"2024-09-17T09:32:12Z","title":"GEIC: Universal and Multilingual Named Entity Recognition with Large\n Language Models","summary":" Large Language Models (LLMs) have supplanted traditional methods in numerous\nnatural language processing tasks. 
Nonetheless, in Named Entity Recognition\n(NER), existing LLM-based methods underperform compared to baselines and\nrequire significantly more computational resources, limiting their application.\nIn this paper, we introduce the task of generation-based extraction and\nin-context classification (GEIC), designed to leverage LLMs' prior knowledge\nand self-attention mechanisms for NER tasks. We then propose CascadeNER, a\nuniversal and multilingual GEIC framework for few-shot and zero-shot NER.\nCascadeNER employs model cascading to utilize two small-parameter LLMs to\nextract and classify independently, reducing resource consumption while\nenhancing accuracy. We also introduce AnythingNER, the first NER dataset\nspecifically designed for LLMs, including 8 languages, 155 entity types and a\nnovel dynamic categorization system. Experiments show that CascadeNER achieves\nstate-of-the-art performance on low-resource and fine-grained scenarios,\nincluding CrossNER and FewNERD. Our work is openly accessible.\n","authors":["Hanjun Luo","Yingbin Jin","Xuecheng Liu","Tong Shang","Ruizhe Chen","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2409.11022v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12907v2","updated":"2024-09-18T10:04:57Z","published":"2024-06-12T13:30:48Z","title":"Reconciling Kaplan and Chinchilla Scaling Laws","summary":" Kaplan et al. [2020] (`Kaplan') and Hoffmann et al. [2022] (`Chinchilla')\nstudied the scaling behavior of transformers trained on next-token language\nprediction. These studies produced different estimates for how the number of\nparameters ($N$) and training tokens ($D$) should be set to achieve the lowest\npossible loss for a given compute budget ($C$). Kaplan: $N_\\text{optimal}\n\\propto C^{0.73}$, Chinchilla: $N_\\text{optimal} \\propto C^{0.50}$. This paper\nfinds that much of this discrepancy can be attributed to Kaplan counting\nnon-embedding rather than total parameters, combined with their analysis being\nperformed at small scale. Simulating the Chinchilla study under these\nconditions produces biased scaling coefficients close to Kaplan's. Hence, this\npaper reaffirms Chinchilla's scaling coefficients, by explaining the primary\ncause of Kaplan's original overestimation. As a second contribution, the paper\nexplains differences in the reported relationships between loss and compute.\nThese findings lead us to recommend that future scaling studies use total\nparameters and compute.\n","authors":["Tim Pearce","Jinyeop Song"],"pdf_url":"https://arxiv.org/pdf/2406.12907v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11844v1","updated":"2024-09-18T09:55:48Z","published":"2024-09-18T09:55:48Z","title":"MEOW: MEMOry Supervised LLM Unlearning Via Inverted Facts","summary":" Large Language Models (LLMs) can memorize sensitive information, raising\nconcerns about potential misuse. LLM Unlearning, a post-hoc approach to remove\nthis information from trained LLMs, offers a promising solution to mitigate\nthese risks. However, previous practices face three key challenges: 1. Utility:\nsuccessful unlearning often causes catastrophic collapse on unrelated tasks. 2.\nEfficiency: many methods either involve adding similarly sized models, which\nslows down unlearning or inference, or require retain data that are difficult\nto obtain. 3. Robustness: even effective methods may still leak data via\nextraction techniques. To address these challenges, we propose MEOW, a simple\nyet effective gradient descent-based unlearning method. 
Specifically, we use an\noffline LLM to generate a set of inverted facts. Then, we design a new metric,\nMEMO, to quantify memorization in LLMs. Finally, based on the signals provided\nby MEMO, we select the most appropriate set of inverted facts and finetune the\nmodel based on them. We evaluate MEOW on the commonly used unlearning benchmark,\nTOFU, with Llama2-7B-Chat and Phi-1.5B, and test it on both NLU and NLG tasks.\nResults demonstrate a significant improvement in forget quality for MEOW without\nsubstantial loss in model utility. Meanwhile, MEOW does not exhibit significant\ndegradation in NLU or NLG capabilities, and there is even a slight improvement\nin NLU performance.\n","authors":["Tianle Gu","Kexin Huang","Ruilin Luo","Yuanqi Yao","Yujiu Yang","Yan Teng","Yingchun Wang"],"pdf_url":"https://arxiv.org/pdf/2409.11844v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12698v2","updated":"2024-09-18T09:47:54Z","published":"2024-04-19T08:06:01Z","title":"Neural Semantic Parsing with Extremely Rich Symbolic Meaning\n Representations","summary":" Current open-domain neural semantic parsers show impressive performance.\nHowever, closer inspection of the symbolic meaning representations they produce\nreveals significant weaknesses: sometimes they tend to merely copy character\nsequences from the source text to form symbolic concepts, defaulting to the\nmost frequent word sense based on the training distribution. By leveraging the\nhierarchical structure of a lexical ontology, we introduce a novel\ncompositional symbolic representation for concepts based on their position in\nthe taxonomical hierarchy. This representation provides richer semantic\ninformation and enhances interpretability. We introduce a neural \"taxonomical\"\nsemantic parser to utilize this new representation system of predicates, and\ncompare it with a standard neural semantic parser trained on the traditional\nmeaning representation format, employing a novel challenge set and a new\nevaluation metric. Our experimental findings demonstrate that the\ntaxonomical model, trained on much richer and more complex meaning\nrepresentations, performs slightly below the traditional model on the standard\nevaluation metrics, but outperforms it when dealing with\nout-of-vocabulary concepts. This finding is encouraging for research in\ncomputational semantics that aims to combine data-driven distributional\nmeanings with knowledge-based symbolic representations.\n","authors":["Xiao Zhang","Gosse Bouma","Johan Bos"],"pdf_url":"https://arxiv.org/pdf/2404.12698v2.pdf","comment":"This manuscript has been accepted by Computational Linguistics\n journal on 2024-09-07"},{"id":"http://arxiv.org/abs/2407.14507v3","updated":"2024-09-18T09:25:20Z","published":"2024-07-19T17:59:03Z","title":"Internal Consistency and Self-Feedback in Large Language Models: A\n Survey","summary":" Large language models (LLMs) often exhibit deficient reasoning or generate\nhallucinations. To address these, studies prefixed with \"Self-\" such as\nSelf-Consistency, Self-Improve, and Self-Refine have been initiated. They share\na commonality: involving LLMs evaluating and updating themselves. Nonetheless,\nthese efforts lack a unified perspective on summarization, as existing surveys\npredominantly focus on categorization.\n In this paper, we use a unified perspective of internal consistency, offering\nexplanations for reasoning deficiencies and hallucinations. 
Internal\nconsistency refers to the consistency in expressions among LLMs' latent,\ndecoding, or response layers based on sampling methodologies. Then, we\nintroduce an effective theoretical framework capable of mining internal\nconsistency, named Self-Feedback. This framework consists of two modules:\nSelf-Evaluation and Self-Update. The former captures internal consistency\nsignals, while the latter leverages the signals to enhance either the model's\nresponse or the model itself. This framework has been employed in numerous\nstudies.\n We systematically classify these studies by tasks and lines of work;\nsummarize relevant evaluation methods and benchmarks; and delve into the\nconcern, \"Does Self-Feedback Really Work?\" We also propose several critical\nviewpoints, including the \"Hourglass Evolution of Internal Consistency\",\n\"Consistency Is (Almost) Correctness\" hypothesis, and \"The Paradox of Latent\nand Explicit Reasoning\". The relevant resources are open-sourced at\nhttps://github.com/IAAR-Shanghai/ICSFSurvey.\n","authors":["Xun Liang","Shichao Song","Zifan Zheng","Hanyu Wang","Qingchen Yu","Xunkai Li","Rong-Hua Li","Yi Wang","Zhonghao Wang","Feiyu Xiong","Zhiyu Li"],"pdf_url":"https://arxiv.org/pdf/2407.14507v3.pdf","comment":"20 pages, 10 figures, 6 tables, 13 equations"},{"id":"http://arxiv.org/abs/2409.11827v1","updated":"2024-09-18T09:21:25Z","published":"2024-09-18T09:21:25Z","title":"Extract-and-Abstract: Unifying Extractive and Abstractive Summarization\n within Single Encoder-Decoder Framework","summary":" Extract-then-Abstract is a naturally coherent paradigm to conduct abstractive\nsummarization with the help of salient information identified by the extractive\nmodel. Previous works that adopt this paradigm train the extractor and\nabstractor separately and introduce extra parameters to highlight the extracted\nsalients to the abstractor, which results in error accumulation and additional\ntraining costs. In this paper, we first introduce a parameter-free highlight\nmethod into the encoder-decoder framework: replacing the encoder attention mask\nwith a saliency mask in the cross-attention module to force the decoder to\nfocus only on salient parts of the input. A preliminary analysis compares\ndifferent highlight methods, demonstrating the effectiveness of our saliency\nmask. We further propose the novel extract-and-abstract paradigm, ExtAbs, which\njointly and seamlessly performs Extractive and Abstractive summarization tasks\nwithin a single encoder-decoder model to reduce error accumulation. In ExtAbs,\nthe vanilla encoder is augmented to extract salients, and the vanilla decoder\nis modified with the proposed saliency mask to generate summaries. Built upon\nBART and PEGASUS, experiments on three datasets show that ExtAbs achieves\nsuperior performance to the baselines on the extractive task and performs\ncomparably to, or even better than, the vanilla models on the abstractive task.\n","authors":["Yuping Wu","Hao Li","Hongbo Zhu","Goran Nenadic","Xiao-Jun Zeng"],"pdf_url":"https://arxiv.org/pdf/2409.11827v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.08841v3","updated":"2024-09-18T09:09:07Z","published":"2022-12-17T10:43:25Z","title":"AugTriever: Unsupervised Dense Retrieval by Scalable Data Augmentation","summary":" Dense retrievers have made significant strides in text retrieval and\nopen-domain question answering. However, most of these achievements have relied\nheavily on extensive human-annotated supervision. 
In this study, we aim to\ndevelop unsupervised methods for improving dense retrieval models. We propose\ntwo approaches that enable annotation-free and scalable training by creating\npseudo query-document pairs: query extraction and transferred query generation.\nThe query extraction method involves selecting salient spans from the original\ndocument to generate pseudo queries. On the other hand, the transferred query\ngeneration method utilizes generation models trained for other NLP tasks, such\nas summarization, to produce pseudo queries. Through extensive experimentation,\nwe demonstrate that models trained using these augmentation methods can achieve\nperformance comparable to, if not better than, multiple strong dense baselines.\nMoreover, combining these strategies leads to further improvements, resulting\nin superior performance in unsupervised dense retrieval, unsupervised domain\nadaptation, and supervised finetuning, benchmarked on both BEIR and ODQA\ndatasets. Code and datasets are publicly available at\nhttps://github.com/salesforce/AugTriever.\n","authors":["Rui Meng","Ye Liu","Semih Yavuz","Divyansh Agarwal","Lifu Tu","Ning Yu","Jianguo Zhang","Meghana Bhat","Yingbo Zhou"],"pdf_url":"https://arxiv.org/pdf/2212.08841v3.pdf","comment":"DCAI24, October 25, 2024, Boise, ID"},{"id":"http://arxiv.org/abs/2409.11798v1","updated":"2024-09-18T08:30:20Z","published":"2024-09-18T08:30:20Z","title":"The Factuality of Large Language Models in the Legal Domain","summary":" This paper investigates the factuality of large language models (LLMs) as\nknowledge bases in the legal domain, in a realistic usage scenario: we allow\nfor acceptable variations in the answer, and let the model abstain from\nanswering when uncertain. First, we design a dataset of diverse factual\nquestions about case law and legislation. We then use the dataset to evaluate\nseveral LLMs under different evaluation methods, including exact, alias, and\nfuzzy matching. Our results show that the performance improves significantly\nunder the alias and fuzzy matching methods. Further, we explore the impact of\nabstaining and in-context examples, finding that both strategies enhance\nprecision. Finally, we demonstrate that additional pre-training on legal\ndocuments, as seen with SaulLM, further improves factual precision from 63% to\n81%.\n","authors":["Rajaa El Hamdani","Thomas Bonald","Fragkiskos Malliaros","Nils Holzenberger","Fabian Suchanek"],"pdf_url":"https://arxiv.org/pdf/2409.11798v1.pdf","comment":"CIKM 2024, short paper"},{"id":"http://arxiv.org/abs/2407.01272v3","updated":"2024-09-18T07:58:55Z","published":"2024-07-01T13:25:33Z","title":"Show Less, Instruct More: Enriching Prompts with Definitions and\n Guidelines for Zero-Shot NER","summary":" Recently, several specialized instruction-tuned Large Language Models (LLMs)\nfor Named Entity Recognition (NER) have emerged. Compared to traditional NER\napproaches, these models have demonstrated strong generalization capabilities.\nExisting LLMs primarily focus on addressing zero-shot NER on Out-of-Domain\ninputs, while fine-tuning on an extensive number of entity classes that often\nhighly or completely overlap with test sets. In this work, instead, we propose\nSLIMER, an approach designed to tackle never-seen-before entity tags by\ninstructing the model on fewer examples, and by leveraging a prompt enriched\nwith definition and guidelines. 
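As a concrete illustration of a definition-and-guidelines-enriched prompt of the kind SLIMER is described as using, a hypothetical template might look like this; the wording and fields are assumptions, not the released prompt.

```python
# Hypothetical zero-shot NER prompt in the spirit described above: the model is
# instructed with a per-tag definition and annotation guidelines. Wording and
# structure are illustrative assumptions, not SLIMER's actual template.
PROMPT_TEMPLATE = """You are an expert annotator.
Entity tag: {tag}
Definition: {definition}
Guidelines: {guidelines}

Extract every span of type "{tag}" from the text below.
Return a JSON list of strings, or [] if none are present.

Text: {text}"""

prompt = PROMPT_TEMPLATE.format(
    tag="MYTHOLOGICAL FIGURE",
    definition="A named character from myth or legend, not a historical person.",
    guidelines="Include deities and heroes; exclude place names derived from them.",
    text="Odysseus prayed to Athena before leaving Ithaca.",
)
print(prompt)
```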
Experiments demonstrate that definition and\nguidelines yield better performance, faster and more robust learning,\nparticularly when labelling unseen named entities. Furthermore, SLIMER performs\ncomparably to state-of-the-art approaches in out-of-domain zero-shot NER, while\nbeing trained in a fairer, though certainly more challenging, setting.\n","authors":["Andrew Zamai","Andrea Zugarini","Leonardo Rigutini","Marco Ernandes","Marco Maggini"],"pdf_url":"https://arxiv.org/pdf/2407.01272v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.04464v2","updated":"2024-09-18T07:43:12Z","published":"2024-09-03T07:25:01Z","title":"Leveraging Large Language Models for Solving Rare MIP Challenges","summary":" Mixed Integer Programming (MIP) has been extensively applied in areas\nrequiring mathematical solvers to address complex instances within tight time\nconstraints. However, as the problem scale increases, the complexity of model\nformulation and finding feasible solutions escalates significantly. In\ncontrast, the model-building cost for end-to-end models, such as large language\nmodels (LLMs), remains largely unaffected by problem scale due to their pattern\nrecognition capabilities. While LLMs, like GPT-4, without fine-tuning, can\nhandle some traditional medium-scale MIP problems, they struggle with uncommon\nor highly specialized MIP scenarios. Fine-tuning LLMs can yield some feasible\nsolutions for medium-scale MIP instances, but these models typically fail to\nexplore diverse solutions when constrained by a low and constant temperature,\nlimiting their performance. In this paper, we propose and evaluate a\nrecursively dynamic temperature method integrated with a chain-of-thought\napproach. Our findings show that starting with a high temperature and gradually\nlowering it leads to better feasible solutions compared to other dynamic\ntemperature strategies. Additionally, by comparing results generated by the LLM\nwith those from Gurobi, we demonstrate that the LLM can produce solutions that\ncomplement traditional solvers by accelerating the pruning process and\nimproving overall efficiency.\n","authors":["Teng Wang","Wing-Yin Yu","Ruifeng She","Wenhan Yang","Taijie Chen","Jianping Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.04464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10640v2","updated":"2024-09-18T07:35:46Z","published":"2024-09-16T18:15:28Z","title":"Exploring Fine-tuned Generative Models for Keyphrase Selection: A Case\n Study for Russian","summary":" Keyphrase selection plays a pivotal role within the domain of scholarly\ntexts, facilitating efficient information retrieval, summarization, and\nindexing. In this work, we explored how to apply fine-tuned generative\ntransformer-based models to the specific task of keyphrase selection within\nRussian scientific texts. We experimented with four distinct generative models:\nruT5, ruGPT, mT5, and mBART, and evaluated their performance in both\nin-domain and cross-domain settings. The experiments were conducted on the\ntexts of Russian scientific abstracts from four domains: mathematics & computer\nscience, history, medicine, and linguistics. The use of generative models,\nnamely mBART, led to gains in in-domain performance (up to 4.9% in BERTScore,\n9.0% in ROUGE-1, and 12.2% in F1-score) over three keyphrase extraction\nbaselines for the Russian language. 
Although the results for cross-domain usage\nwere significantly lower, they still demonstrated the capability to surpass\nbaseline performances in several cases, underscoring the promising potential\nfor further exploration and refinement in this research field.\n","authors":["Anna Glazkova","Dmitry Morozov"],"pdf_url":"https://arxiv.org/pdf/2409.10640v2.pdf","comment":"DAMDID-2024"},{"id":"http://arxiv.org/abs/2409.10927v2","updated":"2024-09-18T07:23:50Z","published":"2024-09-17T06:51:59Z","title":"Propulsion: Steering LLM with Tiny Fine-Tuning","summary":" The rapid advancements in Large Language Models (LLMs) have revolutionized\nnatural language processing (NLP) and related fields. However, fine-tuning\nthese models for specific tasks remains computationally expensive and risks\ndegrading pre-learned features. To address these challenges, we propose\nPropulsion, a novel parameter efficient fine-tuning (PEFT) method designed to\noptimize task-specific performance while drastically reducing computational\noverhead. Inspired by the concept of controlled adjustments in physical motion,\nPropulsion selectively re-scales specific dimensions of a pre-trained model,\nguiding output predictions toward task objectives without modifying the model's\nparameters. By introducing lightweight, trainable Propulsion parameters at the\npre-trained layer, we minimize the number of parameters updated during\nfine-tuning, preventing overfitting or overwriting of existing knowledge. Our\ntheoretical analysis, supported by Neural Tangent Kernel (NTK) theory, shows\nthat Propulsion approximates the performance of full fine-tuning with far fewer\ntrainable parameters. Empirically, Propulsion reduces the parameter count from\n355.3 million to just 0.086 million, achieving over a 10x reduction compared to\nstandard approaches like LoRA while maintaining competitive performance across\nbenchmarks.\n","authors":["Md Kowsher","Nusrat Jahan Prottasha","Prakash Bhat"],"pdf_url":"https://arxiv.org/pdf/2409.10927v2.pdf","comment":"26 pages, 11 figures"},{"id":"http://arxiv.org/abs/2409.11041v2","updated":"2024-09-18T07:17:17Z","published":"2024-09-17T10:04:50Z","title":"Towards No-Code Programming of Cobots: Experiments with Code Synthesis\n by Large Code Models for Conversational Programming","summary":" While there has been a lot of research recently on robots in household\nenvironments, at the present time, most robots in existence can be found on\nshop floors, and most interactions between humans and robots happen there.\n``Collaborative robots'' (cobots) designed to work alongside humans on assembly\nlines traditionally require expert programming, limiting the ability to make\nchanges, or manual guidance, limiting the expressivity of the resulting programs.\nTo address these limitations, we explore using Large Language Models (LLMs),\nand in particular, their ability to do in-context learning, for\nconversational code generation. As a first step, we define RATS, the\n``Repetitive Assembly Task'', a 2D building task designed to lay the foundation\nfor simulating industry assembly scenarios. In this task, a `programmer'\ninstructs a cobot, using natural language, on how a certain assembly is to be\nbuilt; that is, the programmer induces a program, through natural language. We\ncreate a dataset that pairs target structures with various example instructions\n(human-authored, template-based, and model-generated) and example code. 
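To make the instruction-to-code pairing concrete, here is a toy sketch of what such example code could look like in a 2D building DSL; the function names are invented for illustration and are not the RATS dataset's actual API. It also previews the distinction, evaluated below, between flat instruction sequences and code that uses abstractions.

```python
# Illustrative contrast between two kinds of target code in a toy 2D building
# DSL. The helpers (place, put_row) are hypothetical, not the dataset's API.
grid: dict[tuple[int, int], str] = {}

def place(block: str, x: int, y: int) -> None:
    grid[(x, y)] = block

# "First order code": a flat instruction sequence.
place("red", 0, 0); place("red", 1, 0); place("red", 2, 0)

# "Higher-order code": the same structure via an abstraction and a loop,
# the kind of code the evaluated LLMs reportedly struggle to produce.
def put_row(block: str, y: int, width: int) -> None:
    for x in range(width):
        place(block, x, y)

put_row("blue", 1, 3)
print(sorted(grid))  # six occupied cells across two rows
```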
With\nthis, we systematically evaluate the capabilities of state-of-the-art LLMs for\nsynthesising this kind of code, given in-context examples. Evaluating in a\nsimulated environment, we find that LLMs are capable of generating accurate\n`first order code' (instruction sequences), but have problems producing\n`higher-order code' (abstractions such as functions, or use of loops).\n","authors":["Chalamalasetti Kranti","Sherzod Hakimov","David Schlangen"],"pdf_url":"https://arxiv.org/pdf/2409.11041v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.06787v2","updated":"2024-09-18T07:12:28Z","published":"2024-08-13T10:15:55Z","title":"Unlock the Power of Frozen LLMs in Knowledge Graph Completion","summary":" Traditional knowledge graph completion (KGC) methods rely solely on\nstructural information, struggling with the inherent sparsity of knowledge\ngraphs (KGs). Large Language Models (LLMs) learn extensive knowledge from large\ncorpora with powerful context modeling, making them promising for mitigating\nthe limitations of previous methods. Directly fine-tuning LLMs offers great\ncapability but comes at the cost of huge time and memory consumption, while\nutilizing frozen LLMs yields suboptimal results. In this work, we aim to\nleverage LLMs for KGC effectively and efficiently. We capture the context-aware\nhidden states of knowledge triples by employing prompts to stimulate the\nintermediate layers of LLMs. We then train a data-efficient classifier on these\nhidden states to harness the inherent capabilities of frozen LLMs in KGC.\nAdditionally, to reduce ambiguity and enrich knowledge representation, we\ngenerate detailed entity descriptions through subgraph sampling on KGs.\nExtensive experiments on standard benchmarks demonstrate the efficiency and\neffectiveness of our approach. We outperform traditional KGC methods across\nmost datasets and, notably, achieve classification performance comparable to\nfine-tuned LLMs while enhancing GPU memory efficiency by $188\\times$ and\naccelerating training and inference by $13.48\\times$.\n","authors":["Bo Xue","Yi Xu","Yunchong Song","Yiming Pang","Yuyang Ren","Jiaxin Ding","Luoyi Fu","Xinbing Wang"],"pdf_url":"https://arxiv.org/pdf/2408.06787v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.07970v3","updated":"2024-09-18T07:06:02Z","published":"2024-06-12T07:49:36Z","title":"Guiding In-Context Learning of LLMs through Quality Estimation for\n Machine Translation","summary":" The quality of output from large language models (LLMs), particularly in\nmachine translation (MT), is closely tied to the quality of in-context examples\n(ICEs) provided along with the query, i.e., the text to translate. The\neffectiveness of these ICEs is influenced by various factors, such as the\ndomain of the source text, the order in which the ICEs are presented, the\nnumber of these examples, and the prompt templates used. Naturally, selecting\nthe most impactful ICEs depends on understanding how these factors affect the resulting\ntranslation quality, which ultimately relies on translation references or human\njudgment. This paper presents a novel methodology for in-context learning (ICL)\nthat relies on a search algorithm guided by domain-specific quality estimation\n(QE). Leveraging the XGLM model, our methodology estimates the resulting\ntranslation quality without the need for translation references, selecting\neffective ICEs for MT to maximize translation quality. 
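A minimal sketch of QE-guided example selection in this spirit follows, with the LLM call and the QE model replaced by stubs; the greedy loop and all names are illustrative assumptions, not the paper's exact search algorithm.

```python
import random

# Sketch of QE-guided in-context example (ICE) selection: greedily keep the
# candidate example whose inclusion maximizes a reference-free quality estimate
# of the resulting translation. `translate` and `estimate_quality` are stubs
# standing in for an LLM call and a QE model; the greedy loop itself is an
# illustrative simplification of a QE-guided search.
def translate(source: str, ices: list[tuple[str, str]]) -> str:
    return f"<translation of {source!r} with {len(ices)} examples>"

def estimate_quality(source: str, hypothesis: str) -> float:
    random.seed(hash((source, hypothesis)) % 2**32)  # stub score, stable per input
    return random.random()

def select_ices(source, candidates, k=2):
    chosen: list[tuple[str, str]] = []
    pool = list(candidates)
    for _ in range(k):
        best = max(pool, key=lambda c: estimate_quality(
            source, translate(source, chosen + [c])))
        chosen.append(best)
        pool.remove(best)
    return chosen

cands = [("Hallo", "Hello"), ("Danke", "Thanks"), ("Katze", "Cat")]
print(select_ices("Guten Morgen", cands))
```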
Our results demonstrate\nsignificant improvements over existing ICL methods and higher translation\nperformance compared to fine-tuning a pre-trained language model (PLM),\nspecifically mBART-50.\n","authors":["Javad Pourmostafa Roshan Sharami","Dimitar Shterionov","Pieter Spronck"],"pdf_url":"https://arxiv.org/pdf/2406.07970v3.pdf","comment":"Camera-ready version of the paper for the Association for Machine\n Translation in the Americas (AMTA), including the link to the paper's\n repository"},{"id":"http://arxiv.org/abs/2409.11727v1","updated":"2024-09-18T06:27:26Z","published":"2024-09-18T06:27:26Z","title":"Enabling Real-Time Conversations with Minimal Training Costs","summary":" Large language models (LLMs) have demonstrated the ability to improve human\nefficiency through conversational interactions. Conventional LLM-powered\ndialogue systems, operating on a turn-based paradigm, preclude real-time\ninteraction during response generation. To address this limitation, researchers\nhave proposed duplex models. These models can dynamically adapt to user input,\nfacilitating real-time interactive feedback. However, these methods typically\nrequire substantial computational resources to acquire this ability. To reduce\noverhead, this paper presents a new duplex decoding approach that enhances LLMs\nwith duplex ability, requiring minimal additional training. Specifically, our\nmethod employs parallel decoding of queries and responses in conversations,\neffectively implementing a channel-division-multiplexing decoding strategy.\nExperimental results indicate that our proposed method significantly enhances\nthe naturalness and human-likeness of user-AI interactions with minimal\ntraining costs.\n","authors":["Wang Xu","Shuo Wang","Weilin Zhao","Xu Han","Yukun Yan","Yudi Zhang","Zhe Tao","Zhiyuan Liu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2409.11727v1.pdf","comment":"7 pages, 6 figures, 1 table"},{"id":"http://arxiv.org/abs/2409.11726v1","updated":"2024-09-18T06:21:44Z","published":"2024-09-18T06:21:44Z","title":"Revealing the Challenge of Detecting Character Knowledge Errors in LLM\n Role-Playing","summary":" Large language model (LLM) role-playing has gained widespread attention,\nwhere authentic character knowledge is crucial for constructing realistic\nLLM role-playing agents. However, existing works usually overlook the\nexploration of LLMs' ability to detect characters' known knowledge errors (KKE)\nand unknown knowledge errors (UKE) while playing roles, which leads to\nlow-quality automatic construction of character training corpora. In this\npaper, we propose a probing dataset to evaluate LLMs' ability to detect errors\nin KKE and UKE. The results indicate that even the latest LLMs struggle to\neffectively detect these two types of errors, especially when it comes to\nfamiliar knowledge. We experimented with various reasoning strategies and\npropose an agent-based reasoning method, Self-Recollection and Self-Doubt\n(S2RD), to further explore the potential for improving error detection\ncapabilities. 
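The summary does not spell out S2RD's prompts; a loose sketch of a recollect-then-doubt check, with the LLM stubbed out and all prompt wording assumed, could look like this.

```python
# Sketch of a recollect-then-doubt style check, loosely following the two-stage
# idea named above: first ask the model to recall what the character should
# know, then ask it to re-examine the candidate utterance against that recall.
# `llm` is a stub; prompt wording is an illustrative assumption, not S2RD's.
def llm(prompt: str) -> str:
    return "<model response to: " + prompt[:40] + "...>"

def detect_knowledge_error(character: str, utterance: str) -> str:
    recollection = llm(
        f"List the key facts {character} would plausibly know, given their era "
        f"and background.")
    verdict = llm(
        f"Character: {character}\nRecalled knowledge: {recollection}\n"
        f"Utterance: {utterance}\n"
        f"Doubt step: could any claim here be unknown or wrong for this "
        f"character? Answer KKE, UKE, or OK with a reason.")
    return verdict

print(detect_knowledge_error("Isaac Newton", "I enjoyed debugging my Python code."))
```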
Experiments show that our method effectively improves the LLMs'\nability to detect erroneous character knowledge, but it remains an issue that\nrequires ongoing attention.\n","authors":["Wenyuan Zhang","Jiawei Sheng","Shuaiyi Nie","Zefeng Zhang","Xinghua Zhang","Yongquan He","Tingwen Liu"],"pdf_url":"https://arxiv.org/pdf/2409.11726v1.pdf","comment":"22 pages, 14 figures"},{"id":"http://arxiv.org/abs/2409.11724v1","updated":"2024-09-18T06:19:59Z","published":"2024-09-18T06:19:59Z","title":"TART: An Open-Source Tool-Augmented Framework for Explainable\n Table-based Reasoning","summary":" Current Large Language Models (LLMs) exhibit limited ability to understand\ntable structures and to apply precise numerical reasoning, which is crucial for\ntasks such as table question answering (TQA) and table-based fact verification\n(TFV). To address these challenges, we introduce our Tool-Augmented Reasoning\nframework for Tables (TART), which integrates LLMs with specialized tools. TART\ncontains three key components: a table formatter to ensure accurate data\nrepresentation, a tool maker to develop specific computational tools, and an\nexplanation generator to maintain explainability. We also present the TOOLTAB\ndataset, a new benchmark designed specifically for training LLMs in table-tool\nintegration. Our experiments indicate that TART achieves substantial\nimprovements over existing methods (e.g., Chain-of-Thought) by improving both\nthe precision of data processing and the clarity of the reasoning process.\nNotably, TART paired with CodeLlama achieves 90.0% of the accuracy of the\nclosed-source LLM GPT-3.5-turbo, highlighting its robustness in diverse\nreal-world scenarios. All the code and data are available at\nhttps://github.com/XinyuanLu00/TART.\n","authors":["Xinyuan Lu","Liangming Pan","Yubo Ma","Preslav Nakov","Min-Yen Kan"],"pdf_url":"https://arxiv.org/pdf/2409.11724v1.pdf","comment":"technical report"},{"id":"http://arxiv.org/abs/2409.11283v2","updated":"2024-09-18T05:42:01Z","published":"2024-09-17T15:38:36Z","title":"Zero-resource Hallucination Detection for Text Generation via\n Graph-based Contextual Knowledge Triples Modeling","summary":" LLMs achieve remarkable performance but suffer from hallucinations. Most\nresearch on hallucination detection focuses on questions with short, concrete\ncorrect answers whose faithfulness is easy to check. Hallucination\ndetection for text generation with open-ended answers is more challenging.\nSome researchers use external knowledge to detect hallucinations in generated\ntexts, but external resources for specific scenarios are hard to access. Recent\nstudies on detecting hallucinations in long text without external resources\nconduct consistency comparison among multiple sampled outputs. To handle long\ntexts, researchers split long texts into multiple facts and individually\ncompare the consistency of each pair of facts. However, these methods (1)\nhardly achieve alignment among multiple facts; (2) overlook dependencies\nbetween multiple contextual facts. In this paper, we propose a graph-based\ncontext-aware (GCA) hallucination detection method for text generation, which aligns\nknowledge facts and considers the dependencies between contextual knowledge\ntriples in consistency comparison. Particularly, to align multiple facts, we\nconduct a triple-oriented response segmentation to extract multiple knowledge\ntriples. 
To model dependencies among contextual knowledge triples (facts), we\norganize the contextual triples into a graph and enhance their interactions via\nmessage passing and aggregation with an RGCN. To avoid the omission of knowledge\ntriples in long text, we conduct an LLM-based reverse verification by\nreconstructing the knowledge triples. Experiments show that our model enhances\nhallucination detection and outperforms all baselines.\n","authors":["Xinyue Fang","Zhen Huang","Zhiliang Tian","Minghui Fang","Ziyi Pan","Quntian Fang","Zhihua Wen","Hengyue Pan","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2409.11283v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.00978v2","updated":"2024-09-18T05:28:12Z","published":"2024-04-01T07:49:11Z","title":"Prior Constraints-based Reward Model Training for Aligning Large\n Language Models","summary":" Reinforcement learning with human feedback for aligning large language models\n(LLMs) trains a reward model typically using ranking loss with comparison\npairs. However, the training procedure suffers from an inherent problem: the\nuncontrolled scaling of reward scores during reinforcement learning due to the\nlack of constraints while training the reward model. This paper proposes a Prior\nConstraints-based Reward Model (namely PCRM) training method to mitigate this\nproblem. PCRM incorporates prior constraints, specifically, length ratio and\ncosine similarity between outputs of each comparison pair, during reward model\ntraining to regulate optimization magnitude and control score margins. We\ncomprehensively evaluate PCRM by examining its rank correlation with human\npreferences and its effectiveness in aligning LLMs via RL. Experimental results\ndemonstrate that PCRM significantly improves alignment performance by\neffectively constraining reward score scaling. As another bonus, our method is\neasily integrated into arbitrary rank-based alignment methods, such as direct\npreference optimization, and can yield consistent improvement.\n","authors":["Hang Zhou","Chenglong Wang","Yimin Hu","Tong Xiao","Chunliang Zhang","Jingbo Zhu"],"pdf_url":"https://arxiv.org/pdf/2404.00978v2.pdf","comment":"Accepted by CCL 2024"},{"id":"http://arxiv.org/abs/2409.11704v1","updated":"2024-09-18T05:13:18Z","published":"2024-09-18T05:13:18Z","title":"From Lists to Emojis: How Format Bias Affects Model Alignment","summary":" In this paper, we study format biases in reinforcement learning from human\nfeedback (RLHF). We observe that many widely-used preference models, including\nhuman evaluators, GPT-4, and top-ranking models on the RewardBench benchmark,\nexhibit strong biases towards specific format patterns, such as lists, links,\nbold text, and emojis. Furthermore, large language models (LLMs) can exploit\nthese biases to achieve higher rankings on popular benchmarks like AlpacaEval\nand LMSYS Chatbot Arena. One notable example of this is verbosity bias, where\ncurrent preference models favor longer responses that appear more\ncomprehensive, even when their quality is equal to or lower than shorter,\ncompeting responses. However, format biases beyond verbosity remain largely\nunderexplored in the literature. In this work, we extend the study of biases in\npreference learning beyond the commonly recognized length bias, offering a\ncomprehensive analysis of a wider range of format biases. Additionally, we show\nthat with a small amount of biased data (less than 1%), we can inject\nsignificant bias into the reward model. 
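A sketch of the kind of data poisoning this finding implies: decorate the chosen response in a small fraction of preference pairs with a favored format pattern so a reward model learns to prefer the format itself. Only the sub-1% rate comes from the summary; the decoration and helper names are illustrative.

```python
import random

# Illustrative poisoning of preference data with a format pattern (bulleted
# list plus an emoji). The ~1% rate follows the summary; everything else is an
# assumption for demonstration, not the paper's exact procedure.
def add_format_pattern(text: str) -> str:
    bullets = "\n".join(f"- {s.strip()}" for s in text.split(".") if s.strip())
    return bullets + "\n\U0001F44D"

def inject_format_bias(pairs, rate=0.01, seed=0):
    rng = random.Random(seed)
    poisoned = []
    for chosen, rejected in pairs:
        if rng.random() < rate:
            chosen = add_format_pattern(chosen)
        poisoned.append((chosen, rejected))
    return poisoned

pairs = [("Answer one. Detail two.", "Worse answer.")] * 500
print(sum("\U0001F44D" in c for c, _ in inject_format_bias(pairs)))  # ~5 of 500
```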
Moreover, these format biases can also\nbe easily exploited by downstream alignment algorithms, such as best-of-n\nsampling and online iterative DPO, as it is usually easier to manipulate the\nformat than to improve the quality of responses. Our findings emphasize the\nneed to disentangle format and content both for designing alignment algorithms\nand evaluating models.\n","authors":["Xuanchang Zhang","Wei Xiong","Lichang Chen","Tianyi Zhou","Heng Huang","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.11704v1.pdf","comment":"Working in progress"},{"id":"http://arxiv.org/abs/2409.11703v1","updated":"2024-09-18T04:56:52Z","published":"2024-09-18T04:56:52Z","title":"Harnessing LLMs for API Interactions: A Framework for Classification and\n Synthetic Data Generation","summary":" As Large Language Models (LLMs) advance in natural language processing, there\nis growing interest in leveraging their capabilities to simplify software\ninteractions. In this paper, we propose a novel system that integrates LLMs for\nboth classifying natural language inputs into corresponding API calls and\nautomating the creation of sample datasets tailored to specific API functions.\nBy classifying natural language commands, our system allows users to invoke\ncomplex software functionalities through simple inputs, improving interaction\nefficiency and lowering the barrier to software utilization. Our dataset\ngeneration approach also enables the efficient and systematic evaluation of\ndifferent LLMs in classifying API calls, offering a practical tool for\ndevelopers or business owners to assess the suitability of LLMs for customized\nAPI management. We conduct experiments on several prominent LLMs using\ngenerated sample datasets for various API functions. The results show that\nGPT-4 achieves a high classification accuracy of 0.996, while LLaMA-3-8B\nperforms much worse at 0.759. These findings highlight the potential of LLMs to\ntransform API management and validate the effectiveness of our system in\nguiding model testing and selection across diverse applications.\n","authors":["Chunliang Tao","Xiaojing Fan","Yahe Yang"],"pdf_url":"https://arxiv.org/pdf/2409.11703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.05821v3","updated":"2024-09-18T04:53:46Z","published":"2023-12-10T08:41:24Z","title":"ASVD: Activation-aware Singular Value Decomposition for Compressing\n Large Language Models","summary":" In this paper, we introduce a new post-training compression paradigm for\nLarge Language Models (LLMs) to facilitate their wider adoption. We delve into\nLLM weight low-rank factorization, and find that the challenges of this task\nstem from the outlier phenomenon in the LLM activations and the sensitivity\ndifference among various kinds of layers. To address these issues, we propose a\ntraining-free approach called Activation-aware Singular Value Decomposition\n(ASVD). Specifically, ASVD manages activation outliers by scaling the weight\nmatrix based on the activation distribution, thereby enhancing decomposition\naccuracy. Additionally, we propose an efficient iterative calibration process\nto optimize layer-specific decomposition by addressing the varying sensitivity\nof different LLM layers. ASVD can compress a network by 10-20%, without\ncompromising the performance of LLMs. Based on the success of the low-rank\ndecomposition of projection matrices in the self-attention module, we further\nintroduce ASVD to compress the KV cache. 
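A simplified sketch of activation-aware factorization in the spirit of ASVD: scale the weight's input channels by typical activation magnitude before a truncated SVD, then fold the scaling back, so channels with outlier activations are approximated more faithfully. Real ASVD adds iterative per-layer calibration; the shapes and rank here are toy values.

```python
import numpy as np

# Activation-aware low-rank factorization sketch: W ~= A @ B where the SVD is
# taken on W scaled by per-input-channel activation magnitude, then the scale
# is folded back into B. A simplification of ASVD, not the full method.
def activation_aware_lowrank(W: np.ndarray, act_scale: np.ndarray, rank: int):
    S = np.diag(act_scale)                      # per-input-channel scaling
    U, sig, Vt = np.linalg.svd(W @ S, full_matrices=False)
    A = U[:, :rank] * sig[:rank]                # (out, rank)
    B = Vt[:rank] @ np.diag(1.0 / act_scale)    # (rank, in), scaling folded back
    return A, B

rng = np.random.default_rng(0)
W = rng.normal(size=(64, 64))
acts = rng.normal(size=(1024, 64)) * np.array([10.0 if i % 16 == 0 else 1.0
                                               for i in range(64)])  # outliers
scale = np.abs(acts).mean(axis=0)
A, B = activation_aware_lowrank(W, scale, rank=32)
err = np.linalg.norm((W - A @ B) @ acts.T) / np.linalg.norm(W @ acts.T)
print(f"relative activation error at rank 32: {err:.3f}")
```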
By reducing the channel dimension of\nKV activations, memory requirements for the KV cache can be largely reduced. Thanks\nto the 50-75% reduction in the rank of the KV projection matrices, ASVD can\nfurther achieve 50% KV cache reductions without a performance drop, in a\ntraining-free manner.\n","authors":["Zhihang Yuan","Yuzhang Shang","Yue Song","Qiang Wu","Yan Yan","Guangyu Sun"],"pdf_url":"https://arxiv.org/pdf/2312.05821v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11699v1","updated":"2024-09-18T04:43:41Z","published":"2024-09-18T04:43:41Z","title":"FLARE: Fusing Language Models and Collaborative Architectures for\n Recommender Enhancement","summary":" Hybrid recommender systems, combining item IDs and textual descriptions,\noffer potential for improved accuracy. However, previous work has largely\nfocused on smaller datasets and model architectures. This paper introduces\nFlare (Fusing Language models and collaborative Architectures for Recommender\nEnhancement), a novel hybrid recommender that integrates a language model (mT5)\nwith a collaborative filtering model (Bert4Rec) using a Perceiver network. This\narchitecture allows Flare to effectively combine collaborative and content\ninformation for enhanced recommendations.\n We conduct a two-stage evaluation, first assessing Flare's performance\nagainst established baselines on smaller datasets, where it demonstrates\ncompetitive accuracy. Subsequently, we evaluate Flare on a larger, more\nrealistic dataset with a significantly larger item vocabulary, introducing new\nbaselines for this setting. Finally, we showcase Flare's inherent ability to\nsupport critiquing, enabling users to provide feedback and refine\nrecommendations. We further leverage critiquing as an evaluation method to\nassess the model's language understanding and its transferability to the\nrecommendation task.\n","authors":["Liam Hebert","Marialena Kyriakidi","Hubert Pham","Krishna Sayana","James Pine","Sukhdeep Sodhi","Ambarish Jash"],"pdf_url":"https://arxiv.org/pdf/2409.11699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11677v1","updated":"2024-09-18T03:32:25Z","published":"2024-09-18T03:32:25Z","title":"Enhancing Complex Formula Recognition with Hierarchical Detail-Focused\n Network","summary":" Hierarchical and complex Mathematical Expression Recognition (MER) is\nchallenging due to multiple possible interpretations of a formula, complicating\nboth parsing and evaluation. In this paper, we introduce the Hierarchical\nDetail-Focused Recognition dataset (HDR), the first dataset specifically\ndesigned to address these issues. It consists of a large-scale training set,\nHDR-100M, offering an unprecedented scale and diversity with one hundred\nmillion training instances. The test set, HDR-Test, includes multiple\ninterpretations of complex hierarchical formulas for comprehensive model\nperformance evaluation. Additionally, the parsing of complex formulas often\nsuffers from errors in fine-grained details. To address this, we propose the\nHierarchical Detail-Focused Recognition Network (HDNet), an innovative\nframework that incorporates a hierarchical sub-formula module, focusing on the\nprecise handling of formula details, thereby significantly enhancing MER\nperformance. 
Experimental results demonstrate that HDNet outperforms existing\nMER models across various datasets.\n","authors":["Jiale Wang","Junhui Yu","Huanyong Liu","Chenanran Kong"],"pdf_url":"https://arxiv.org/pdf/2409.11677v1.pdf","comment":"Submitted to the 2025 IEEE International Conference on Acoustics,\n Speech, and Signal Processing (ICASSP 2025)"},{"id":"http://arxiv.org/abs/2409.11673v1","updated":"2024-09-18T03:20:04Z","published":"2024-09-18T03:20:04Z","title":"RUIE: Retrieval-based Unified Information Extraction using Large\n Language Model","summary":" Unified information extraction (UIE) aims to complete all information\nextraction tasks using a single model or framework. While previous work has\nprimarily focused on instruction-tuning large language models (LLMs) with\nconstructed datasets, these methods require significant computational resources\nand struggle to generalize to unseen tasks. To address these limitations, we\npropose RUIE (Retrieval-based Unified Information Extraction), a framework that\nleverages in-context learning to enable rapid generalization while reducing\ncomputational costs. The key challenge in RUIE is selecting the most beneficial\ndemonstrations for LLMs to effectively handle diverse IE tasks. To achieve\nthis, we integrate LLM preferences for ranking candidate demonstrations and\ndesign a keyword-enhanced reward model to capture fine-grained relationships\nbetween queries and demonstrations. We then train a bi-encoder retriever for\nUIE through contrastive learning and knowledge distillation. To the best of our\nknowledge, RUIE is the first trainable retrieval framework for UIE.\nExperimental results on 8 held-out datasets demonstrate RUIE's effectiveness in\ngeneralizing to unseen tasks, with average F1-score improvements of 19.22 and\n3.13 compared to instruction-tuning methods and other retrievers, respectively.\nFurther analysis confirms RUIE's adaptability to LLMs of varying sizes and the\nimportance of its key components.\n","authors":["Xincheng Liao","Junwen Duan","Yixi Huang","Jianxin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.11673v1.pdf","comment":"14 pages, 3 figures"},{"id":"http://arxiv.org/abs/2406.09044v2","updated":"2024-09-18T02:57:12Z","published":"2024-06-13T12:30:02Z","title":"MiLoRA: Harnessing Minor Singular Components for Parameter-Efficient LLM\n Finetuning","summary":" Efficient finetuning of large language models (LLMs) aims to adapt LLMs\nwith reduced computational and memory cost. Previous LoRA-based approaches\ninitialize the low-rank matrices with a Gaussian distribution and zero values\nwhile keeping the original weight matrices frozen. However, the trainable model\nparameters optimized in an unguided subspace might interfere with the\nwell-learned subspace of the pretrained weight matrices. In this paper, we\npropose MiLoRA, a simple yet effective LLM finetuning approach that only\nupdates the minor singular components of the weight matrix while keeping the\nprincipal singular components frozen. It is observed that the minor matrix\ncorresponds to the noisy or long-tail information, while the principal matrix\ncontains important knowledge. MiLoRA initializes the low-rank matrices\nwithin a subspace that is orthogonal to the principal matrix, so the\npretrained knowledge is expected to be well preserved. During finetuning,\nMiLoRA makes the most use of the less-optimized subspace for learning the\nlabeled dataset. 
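A small sketch of the minor-singular-component initialization described here, using a toy matrix; the sizes and rank are illustrative, and the real method operates on LLM weight matrices.

```python
import numpy as np

# Minor-singular-component initialization: SVD the pretrained weight, keep the
# top components frozen as the residual weight, and initialize the trainable
# low-rank pair from the bottom r singular directions, so finetuning starts in
# the least-important subspace. Toy sizes; a sketch of the idea, not the code.
def milora_init(W: np.ndarray, r: int):
    U, sig, Vt = np.linalg.svd(W, full_matrices=False)
    W_principal = (U[:, :-r] * sig[:-r]) @ Vt[:-r]   # frozen during finetuning
    A = U[:, -r:] * np.sqrt(sig[-r:])                # trainable, (out, r)
    B = np.sqrt(sig[-r:])[:, None] * Vt[-r:]         # trainable, (r, in)
    return W_principal, A, B

rng = np.random.default_rng(0)
W = rng.normal(size=(32, 32))
W_frozen, A, B = milora_init(W, r=4)
# At initialization the decomposition reproduces the original weight exactly:
print(np.allclose(W_frozen + A @ B, W))  # True
```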
Extensive experiments on commonsense reasoning, math\nreasoning, instruction following and visual instruction following benchmarks\ndemonstrate the superior performance of our method.\n","authors":["Hanqing Wang","Yixia Li","Shuo Wang","Guanhua Chen","Yun Chen"],"pdf_url":"https://arxiv.org/pdf/2406.09044v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.10666v5","updated":"2024-09-18T02:48:25Z","published":"2024-02-16T13:14:35Z","title":"MURRE: Multi-Hop Table Retrieval with Removal for Open-Domain\n Text-to-SQL","summary":" The open-domain text-to-SQL task aims to retrieve question-relevant tables\nfrom massive databases and generate SQL. However, the performance of current\nmethods is constrained by single-hop retrieval, and existing multi-hop\nretrieval of open-domain question answering is not directly applicable due to\nthe tendency to retrieve tables similar to the retrieved ones but irrelevant to\nthe question. Unlike previous multi-hop retrieval, which supplements the\nquestion with retrieved documents, the questions in text-to-SQL usually already\ncontain all required information. Therefore, we propose the multi-hop table retrieval with\nremoval (MURRE), which removes previously retrieved information from the\nquestion to guide the retriever towards unretrieved relevant tables. Our\nexperiments on two open-domain text-to-SQL datasets demonstrate an average\nimprovement of 5.7% over the previous state-of-the-art results.\n","authors":["Xuanliang Zhang","Dingzirui Wang","Longxu Dou","Qingfu Zhu","Wanxiang Che"],"pdf_url":"https://arxiv.org/pdf/2402.10666v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.02657v3","updated":"2024-09-18T02:31:12Z","published":"2024-04-03T11:40:17Z","title":"Rethinking Kullback-Leibler Divergence in Knowledge Distillation for\n Large Language Models","summary":" Kullback-Leibler divergence has been widely used in Knowledge Distillation\n(KD) to compress Large Language Models (LLMs). Contrary to prior assertions\nthat reverse Kullback-Leibler (RKL) divergence is mode-seeking and thus\npreferable over the mean-seeking forward Kullback-Leibler (FKL) divergence,\nthis study empirically and theoretically demonstrates that neither mode-seeking\nnor mean-seeking properties manifest in KD for LLMs. Instead, RKL and FKL are\nfound to share the same optimization objective and both converge after a\nsufficient number of epochs. However, due to practical constraints, LLMs are\nseldom trained for such an extensive number of epochs. Meanwhile, we further\nfind that RKL focuses on the tail part of the distributions, while FKL focuses\non the head part at the beginning epochs. Consequently, we propose a simple yet\neffective Adaptive Kullback-Leibler (AKL) divergence method, which adaptively\nallocates weights to combine FKL and RKL. 
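A sketch of a combined forward/reverse KL loss in this spirit follows; the paper derives the adaptive weight from the head/tail gap between the two distributions, so the weighting rule below is an illustrative stand-in, not the published formula.

```python
import numpy as np

# Per-token mix of FKL (teacher->student) and RKL (student->teacher) with an
# adaptive weight. `head_gap_weight` is an illustrative stand-in for the
# paper's head/tail-gap-based rule, not the exact published formula.
def kl(p: np.ndarray, q: np.ndarray) -> float:
    eps = 1e-12
    return float(np.sum(p * (np.log(p + eps) - np.log(q + eps))))

def head_gap_weight(teacher: np.ndarray, student: np.ndarray, k: int = 5) -> float:
    head = np.argsort(teacher)[-k:]              # teacher's top-k tokens
    gap = np.abs(teacher[head] - student[head]).sum()
    return float(np.clip(gap, 0.0, 1.0))         # more head disagreement -> more FKL

def akl_like_loss(teacher: np.ndarray, student: np.ndarray) -> float:
    w = head_gap_weight(teacher, student)
    return w * kl(teacher, student) + (1.0 - w) * kl(student, teacher)

rng = np.random.default_rng(0)
t = rng.dirichlet(np.ones(50))
s = rng.dirichlet(np.ones(50))
print(f"AKL-style loss: {akl_like_loss(t, s):.4f}")
```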
Metric-based and GPT-4-based\nevaluations demonstrate that the proposed AKL outperforms the baselines across\nvarious tasks and improves the diversity and quality of generated responses.\n","authors":["Taiqiang Wu","Chaofan Tao","Jiahao Wang","Runming Yang","Zhe Zhao","Ngai Wong"],"pdf_url":"https://arxiv.org/pdf/2404.02657v3.pdf","comment":"work in progress, code available at\n https://github.com/wutaiqiang/LLM_KD_AKL"},{"id":"http://arxiv.org/abs/2409.11638v1","updated":"2024-09-18T02:02:30Z","published":"2024-09-18T02:02:30Z","title":"BanStereoSet: A Dataset to Measure Stereotypical Social Biases in LLMs\n for Bangla","summary":" This study presents BanStereoSet, a dataset designed to evaluate\nstereotypical social biases in multilingual LLMs for the Bangla language. In an\neffort to extend the focus of bias research beyond English-centric datasets, we\nhave localized the content from the StereoSet, IndiBias, and Kamruzzaman et\nal.'s datasets, producing a resource tailored to capture biases prevalent\nwithin the Bangla-speaking community. Our BanStereoSet dataset consists of\n1,194 sentences spanning 9 categories of bias: race, profession, gender,\nageism, beauty, beauty in profession, region, caste, and religion. This dataset\nnot only serves as a crucial tool for measuring bias in multilingual LLMs but\nalso facilitates the exploration of stereotypical bias across different social\ncategories, potentially guiding the development of more equitable language\ntechnologies in Bangladeshi contexts. Our analysis of several language models\nusing this dataset indicates significant biases, reinforcing the necessity for\nculturally and linguistically adapted datasets to develop more equitable\nlanguage technologies.\n","authors":["Mahammed Kamruzzaman","Abdullah Al Monsur","Shrabon Das","Enamul Hassan","Gene Louis Kim"],"pdf_url":"https://arxiv.org/pdf/2409.11638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11636v1","updated":"2024-09-18T01:56:34Z","published":"2024-09-18T01:56:34Z","title":"\"A Woman is More Culturally Knowledgeable than A Man?\": The Effect of\n Personas on Cultural Norm Interpretation in LLMs","summary":" As the deployment of large language models (LLMs) expands, there is an\nincreasing demand for personalized LLMs. One method to personalize and guide\nthe outputs of these models is by assigning a persona -- a role that describes\nthe expected behavior of the LLM (e.g., a man, a woman, an engineer). This\nstudy investigates whether an LLM's understanding of social norms varies across\nassigned personas. Ideally, the perception of a social norm should remain\nconsistent regardless of the persona, since acceptability of a social norm\nshould be determined by the region the norm originates from, rather than by\nindividual characteristics such as gender, body size, or race. A norm is\nuniversal within its cultural context. In our research, we tested 36 distinct\npersonas from 12 sociodemographic categories (e.g., age, gender, beauty) across\nfour different LLMs. We find that LLMs' cultural norm interpretation varies\nbased on the persona used, and the interpretation also varies within a\nsociodemographic category (e.g., a fat person and a thin person in the physical\nappearance group), where an LLM with the more socially desirable persona (e.g.,\na thin person) interprets social norms more accurately than with the less\nsocially desirable persona (e.g., a fat person). 
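Editor's note on the persona-probe entry above: a study of this kind can be reproduced with a small harness like the following; the personas, norms, prompt wording, and the query_llm stub are all placeholders rather than the study's actual materials.

```python
from itertools import product

personas = ["a man", "a woman", "a thin person", "a fat person"]
norms = [("India", "Eating a meal with your hands is acceptable."),
         ("Germany", "Jaywalking across an empty street is acceptable.")]

def query_llm(prompt: str) -> str:
    # Placeholder: plug in any chat-completion client here.
    return "yes"

# The verdict should depend only on the norm's cultural context, never on the
# assigned persona; disagreement across personas signals persona bias.
for persona, (region, norm) in product(personas, norms):
    prompt = (f"Adopt the persona of {persona}. In {region}, is the following "
              f"social norm acceptable? Answer yes or no.\n{norm}")
    print(f"{persona:>15} | {region:>7} | {query_llm(prompt)}")
```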
We also discuss how different\ntypes of social biases may contribute to the results that we observe.\n","authors":["Mahammed Kamruzzaman","Hieu Nguyen","Nazmul Hassan","Gene Louis Kim"],"pdf_url":"https://arxiv.org/pdf/2409.11636v1.pdf","comment":"Preprint, Under Review"},{"id":"http://arxiv.org/abs/2409.05385v3","updated":"2024-09-18T01:39:02Z","published":"2024-09-09T07:32:30Z","title":"Towards Building a Robust Knowledge Intensive Question Answering Model\n with Large Language Models","summary":" The development of LLMs has greatly enhanced the intelligence and fluency of\nquestion answering, while the emergence of retrieval enhancement has enabled\nmodels to better utilize external information. However, the presence of noise\nand errors in retrieved information poses challenges to the robustness of LLMs.\nIn this work, to evaluate the model's performance under multiple interferences,\nwe first construct a dataset based on machine reading comprehension datasets\nsimulating various scenarios, including critical information absence, noise,\nand conflicts. To address the issue of model accuracy decline caused by noisy\nexternal information, we propose a data augmentation-based fine-tuning method\nto enhance LLM's robustness against noise. Additionally, a contrastive learning\napproach is utilized to preserve the model's discrimination capability of\nexternal information. We have conducted experiments on both existing LLMs and\nour approach; the results, evaluated by GPT-4, indicate that our\nproposed methods improve model robustness while strengthening the model's\ndiscrimination capability.\n","authors":["Xingyun Hong","Yan Shao","Zhilin Wang","Manni Duan","Jin Xiongnan"],"pdf_url":"https://arxiv.org/pdf/2409.05385v3.pdf","comment":"This paper has been accepted by NLPCC-2024"},{"id":"http://arxiv.org/abs/2402.06264v3","updated":"2024-09-18T00:59:58Z","published":"2024-02-09T09:25:18Z","title":"LLaVA-Docent: Instruction Tuning with Multimodal Large Language Model to\n Support Art Appreciation Education","summary":" Despite the development of various AI systems to support learning in various\ndomains, AI assistance for art appreciation education has not been extensively\nexplored. Art appreciation, often perceived as an unfamiliar and challenging\nendeavor for most students, can be more accessible with a generative AI enabled\nconversation partner that provides tailored questions and encourages the\naudience to deeply appreciate artwork. This study explores the application of\nmultimodal large language models (MLLMs) in art appreciation education, with a\nfocus on developing LLaVA-Docent, a model designed to serve as a personal tutor\nfor art appreciation. Our approach involved design and development research,\nfocusing on iterative enhancement of the application to\nproduce a functional MLLM-enabled chatbot along with a data design framework\nfor art appreciation education. To that end, we established a virtual dialogue\ndataset that was generated by GPT-4, which was instrumental in training our\nMLLM, LLaVA-Docent. The performance of LLaVA-Docent was evaluated by\nbenchmarking it against alternative settings and revealed its distinct\nstrengths and weaknesses. 
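Editor's note on the robust knowledge-intensive QA entry above: its fine-tuning data simulates critical information absence, noise, and conflicts. A minimal sketch of that perturbation step follows; the three corruption modes mirror the summary, while the distractor text and sentence-splitting details are assumptions.

```python
import random

def corrupt_context(context: str, gold_answer: str, mode: str) -> str:
    # Build a perturbed retrieval context for robustness fine-tuning.
    sentences = [s.strip() for s in context.split(".") if s.strip()]
    if mode == "absence":      # drop the sentence carrying the answer
        sentences = [s for s in sentences if gold_answer not in s]
    elif mode == "noise":      # prepend an irrelevant distractor passage
        sentences.insert(0, "The 1928 Olympics were held in Amsterdam")
    elif mode == "conflict":   # inject a contradicting claim
        sentences.append(f"Some sources instead claim the answer is not {gold_answer}")
    random.shuffle(sentences)
    return ". ".join(sentences) + "."

ctx = "Marie Curie won the Nobel Prize in 1903. She was born in Warsaw."
print(corrupt_context(ctx, "1903", mode="conflict"))
```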
Our findings highlight the efficacy of the MLLM-based\npersonalized art appreciation chatbot and demonstrate its applicability as a\nnovel approach to how art appreciation is taught and experienced.\n","authors":["Unggi Lee","Minji Jeon","Yunseo Lee","Gyuri Byun","Yoorim Son","Jaeyoon Shin","Hongkyu Ko","Hyeoncheol Kim"],"pdf_url":"https://arxiv.org/pdf/2402.06264v3.pdf","comment":"37 pages, 4 figures, 10 tables"},{"id":"http://arxiv.org/abs/2311.10777v6","updated":"2024-09-18T00:16:27Z","published":"2023-11-16T06:01:47Z","title":"A Systematic Review of Aspect-based Sentiment Analysis: Domains,\n Methods, and Trends","summary":" Aspect-based sentiment analysis (ABSA) is a fine-grained type of sentiment\nanalysis that identifies aspects and their associated opinions from a given\ntext. With the surge of digital opinionated text data, ABSA has gained increasing\npopularity for its ability to mine more detailed and targeted insights. Many\nreview papers on ABSA subtasks and solution methodologies exist; however, few\nfocus on trends over time or systemic issues relating to research application\ndomains, datasets, and solution approaches. To fill the gap, this paper\npresents a systematic literature review (SLR) of ABSA studies with a focus on\ntrends and high-level relationships among these fundamental components. This\nreview is one of the largest SLRs on ABSA. To our knowledge, it is also the\nfirst to systematically examine the interrelations among ABSA research and data\ndistribution across domains, as well as trends in solution paradigms and\napproaches. Our sample includes 727 primary studies screened from 8550 search\nresults without time constraints via an innovative automatic filtering process.\nOur quantitative analysis not only identifies trends in nearly two decades of\nABSA research development but also unveils a systemic lack of dataset and\ndomain diversity as well as domain mismatch that may hinder the development of\nfuture ABSA research. We discuss these findings and their implications and\npropose suggestions for future research.\n","authors":["Yan Cathy Hua","Paul Denny","Katerina Taskova","Jörg Wicker"],"pdf_url":"https://arxiv.org/pdf/2311.10777v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.05632v3","updated":"2024-09-18T00:02:02Z","published":"2024-01-11T03:04:38Z","title":"Natural Language Processing for Dialects of a Language: A Survey","summary":" State-of-the-art natural language processing (NLP) models are trained on\nmassive training corpora, and report superlative performance on evaluation\ndatasets. This survey delves into an important attribute of these datasets: the\ndialect of a language. Motivated by the performance degradation of NLP models\nfor dialectic datasets and its implications for the equity of language\ntechnologies, we survey past research in NLP for dialects in terms of datasets\nand approaches. We describe a wide range of NLP tasks in terms of two\ncategories: natural language understanding (NLU) (for tasks such as dialect\nclassification, sentiment analysis, parsing, and NLU benchmarks) and natural\nlanguage generation (NLG) (for summarisation, machine translation, and dialogue\nsystems). The survey is also broad in its coverage of languages, which include\nEnglish, Arabic, and German, among others. We observe that past work in NLP\nconcerning dialects goes deeper than mere dialect classification. 
This\nincludes early approaches that used sentence transduction, leading to the\nrecent approaches that integrate hypernetworks into LoRA. We expect that this\nsurvey will be useful to NLP researchers interested in building equitable\nlanguage technologies by rethinking LLM benchmarks and model architectures.\n","authors":["Aditya Joshi","Raj Dabre","Diptesh Kanojia","Zhuang Li","Haolan Zhan","Gholamreza Haffari","Doris Dippold"],"pdf_url":"https://arxiv.org/pdf/2401.05632v3.pdf","comment":"The paper is under review at ACM Computing Surveys. Please reach out\n to the authors in the case of feedback"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.12193v1","updated":"2024-09-18T17:59:44Z","published":"2024-09-18T17:59:44Z","title":"Vista3D: Unravel the 3D Darkside of a Single Image","summary":" We embark on the age-old quest: unveiling the hidden dimensions of objects\nfrom mere glimpses of their visible parts. To address this, we present Vista3D,\na framework that realizes swift and consistent 3D generation within a mere 5\nminutes. At the heart of Vista3D lies a two-phase approach: the coarse phase\nand the fine phase. In the coarse phase, we rapidly generate initial geometry\nwith Gaussian Splatting from a single image. In the fine phase, we extract a\nSigned Distance Function (SDF) directly from learned Gaussian Splatting,\noptimizing it with a differentiable isosurface representation. Furthermore, it\nelevates the quality of generation by using a disentangled representation with\ntwo independent implicit functions to capture both visible and obscured aspects\nof objects. Additionally, it harmonizes gradients from a 2D diffusion prior with\n3D-aware diffusion priors via angular diffusion prior composition. Through\nextensive evaluation, we demonstrate that Vista3D effectively sustains a\nbalance between the consistency and diversity of the generated 3D objects.\nDemos and code will be available at https://github.com/florinshen/Vista3D.\n","authors":["Qiuhong Shen","Xingyi Yang","Michael Bi Mi","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.12193v1.pdf","comment":"ECCV'2024"},{"id":"http://arxiv.org/abs/2409.12192v1","updated":"2024-09-18T17:59:43Z","published":"2024-09-18T17:59:43Z","title":"DynaMo: In-Domain Dynamics Pretraining for Visuo-Motor Control","summary":" Imitation learning has proven to be a powerful tool for training complex\nvisuomotor policies. However, current methods often require hundreds to\nthousands of expert demonstrations to handle high-dimensional visual\nobservations. A key reason for this poor data efficiency is that visual\nrepresentations are predominantly either pretrained on out-of-domain data or\ntrained directly through a behavior cloning objective. In this work, we present\nDynaMo, a new in-domain, self-supervised method for learning visual\nrepresentations. Given a set of expert demonstrations, we jointly learn a\nlatent inverse dynamics model and a forward dynamics model over a sequence of\nimage embeddings, predicting the next frame in latent space, without\naugmentations, contrastive sampling, or access to ground truth actions.\nImportantly, DynaMo does not require any out-of-domain data such as Internet\ndatasets or cross-embodied datasets. On a suite of six simulated and real\nenvironments, we show that representations learned with DynaMo significantly\nimprove downstream imitation learning performance over prior self-supervised\nlearning objectives and pretrained representations. 
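Editor's note on the DynaMo entry above: the joint latent inverse/forward dynamics objective (next-frame prediction in latent space, no ground-truth actions) can be sketched in a few lines of PyTorch. The linear modules, dimensions, and the detached target below are illustrative simplifications, not the paper's architecture.

```python
import torch
import torch.nn as nn

emb_dim, latent_act = 256, 16
encoder = nn.Linear(512, emb_dim)                    # stand-in visual encoder
inverse = nn.Linear(2 * emb_dim, latent_act)         # (z_t, z_{t+1}) -> latent action
forward_model = nn.Linear(emb_dim + latent_act, emb_dim)  # (z_t, a_t) -> z_{t+1}

frames = torch.randn(8, 10, 512)                     # batch of 10-step demo sequences
z = encoder(frames)                                  # (B, T, emb_dim)
z_t, z_next = z[:, :-1], z[:, 1:]
a = inverse(torch.cat([z_t, z_next], dim=-1))        # latent inverse dynamics
pred_next = forward_model(torch.cat([z_t, a], dim=-1))
# Predict the next frame in latent space; no augmentations or contrastive pairs.
# Detaching the target is our own guard against representation collapse here.
loss = nn.functional.mse_loss(pred_next, z_next.detach())
loss.backward()
```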
Gains from using DynaMo\nhold across policy classes such as Behavior Transformer, Diffusion Policy, MLP,\nand nearest neighbors. Finally, we ablate over key components of DynaMo and\nmeasure its impact on downstream policy performance. Robot videos are best\nviewed at https://dynamo-ssl.github.io\n","authors":["Zichen Jeff Cui","Hengkai Pan","Aadhithya Iyer","Siddhant Haldar","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2409.12192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12191v1","updated":"2024-09-18T17:59:32Z","published":"2024-09-18T17:59:32Z","title":"Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at\n Any Resolution","summary":" We present the Qwen2-VL Series, an advanced upgrade of the previous Qwen-VL\nmodels that redefines the conventional predetermined-resolution approach in\nvisual processing. Qwen2-VL introduces the Naive Dynamic Resolution mechanism,\nwhich enables the model to dynamically process images of varying resolutions\ninto different numbers of visual tokens. This approach allows the model to\ngenerate more efficient and accurate visual representations, closely aligning\nwith human perceptual processes. The model also integrates Multimodal Rotary\nPosition Embedding (M-RoPE), facilitating the effective fusion of positional\ninformation across text, images, and videos. We employ a unified paradigm for\nprocessing both images and videos, enhancing the model's visual perception\ncapabilities. To explore the potential of large multimodal models, Qwen2-VL\ninvestigates the scaling laws for large vision-language models (LVLMs). By\nscaling both the model size-with versions at 2B, 8B, and 72B parameters-and the\namount of training data, the Qwen2-VL Series achieves highly competitive\nperformance. Notably, the Qwen2-VL-72B model achieves results comparable to\nleading models such as GPT-4o and Claude3.5-Sonnet across various multimodal\nbenchmarks, outperforming other generalist models. Code is available at\n\\url{https://github.com/QwenLM/Qwen2-VL}.\n","authors":["Peng Wang","Shuai Bai","Sinan Tan","Shijie Wang","Zhihao Fan","Jinze Bai","Keqin Chen","Xuejing Liu","Jialin Wang","Wenbin Ge","Yang Fan","Kai Dang","Mengfei Du","Xuancheng Ren","Rui Men","Dayiheng Liu","Chang Zhou","Jingren Zhou","Junyang Lin"],"pdf_url":"https://arxiv.org/pdf/2409.12191v1.pdf","comment":"Code is available at https://github.com/QwenLM/Qwen2-VL"},{"id":"http://arxiv.org/abs/2409.12190v1","updated":"2024-09-18T17:59:29Z","published":"2024-09-18T17:59:29Z","title":"Bundle Adjustment in the Eager Mode","summary":" Bundle adjustment (BA) is a critical technique in various robotic\napplications, such as simultaneous localization and mapping (SLAM), augmented\nreality (AR), and photogrammetry. BA optimizes parameters such as camera poses\nand 3D landmarks to align them with observations. With the growing importance\nof deep learning in perception systems, there is an increasing need to\nintegrate BA with deep learning frameworks for enhanced reliability and\nperformance. However, widely-used C++-based BA frameworks, such as GTSAM,\ng$^2$o, and Ceres, lack native integration with modern deep learning libraries\nlike PyTorch. This limitation affects their flexibility, adaptability, ease of\ndebugging, and overall implementation efficiency. To address this gap, we\nintroduce an eager-mode BA framework seamlessly integrated with PyPose,\nproviding PyTorch-compatible interfaces with high efficiency. 
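Editor's note on the eager-mode bundle adjustment entry above: to avoid guessing at PyPose's actual API, here is a toy reprojection-error problem optimized directly with PyTorch autograd. Real BA would also optimize rotations on a Lie group and use a 2nd-order solver, as the entry describes; this sketch only illustrates what "eager mode" buys.

```python
import torch

# Toy problem: refine 3D landmarks and per-camera translations from 2D observations.
points = torch.randn(20, 3, requires_grad=True)          # landmarks
trans = torch.zeros(4, 3, requires_grad=True)            # camera translations
obs = torch.rand(4, 20, 2)                               # observed pixels (normalized)

opt = torch.optim.Adam([points, trans], lr=1e-2)
for _ in range(200):
    opt.zero_grad()
    cam_pts = points[None] + trans[:, None]              # world -> each camera frame
    proj = cam_pts[..., :2] / cam_pts[..., 2:].clamp_min(1e-6)  # pinhole projection
    loss = ((proj - obs) ** 2).sum()                     # reprojection error
    loss.backward()                                      # eager-mode autograd
    opt.step()
```

Because every step is an ordinary PyTorch op, the residuals interoperate directly with deep-learning modules, which is the gap the paper's framework addresses.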
Our approach\nincludes GPU-accelerated, differentiable, and sparse operations designed for\n2nd-order optimization, Lie group and Lie algebra operations, and linear\nsolvers. Our eager-mode BA on GPU demonstrates substantial runtime efficiency,\nachieving an average speedup of 18.5$\times$, 22$\times$, and 23$\times$\ncompared to GTSAM, g$^2$o, and Ceres, respectively.\n","authors":["Zitong Zhan","Huan Xu","Zihang Fang","Xinpeng Wei","Yaoyu Hu","Chen Wang"],"pdf_url":"https://arxiv.org/pdf/2409.12190v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12189v1","updated":"2024-09-18T17:58:51Z","published":"2024-09-18T17:58:51Z","title":"Massively Multi-Person 3D Human Motion Forecasting with Scene Context","summary":" Forecasting long-term 3D human motion is challenging: the stochasticity of\nhuman behavior makes it hard to generate realistic human motion from the input\nsequence alone. Information on the scene environment and the motion of nearby\npeople can greatly aid the generation process. We propose a scene-aware social\ntransformer model (SAST) to forecast long-term (10s) human motion.\nUnlike previous models, our approach can model interactions between widely\nvarying numbers of people and objects in a scene. We combine a temporal\nconvolutional encoder-decoder architecture with a Transformer-based bottleneck\nthat allows us to efficiently combine motion and scene information. We model\nthe conditional motion distribution using denoising diffusion models. We\nbenchmark our approach on the Humans in Kitchens dataset, which contains 1 to\n16 persons and 29 to 50 objects that are visible simultaneously. Our model\noutperforms other approaches in terms of realism and diversity on different\nmetrics and in a user study. Code is available at\nhttps://github.com/felixbmuller/SAST.\n","authors":["Felix B Mueller","Julian Tanke","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2409.12189v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.12167v1","updated":"2024-09-18T17:35:19Z","published":"2024-09-18T17:35:19Z","title":"multiPI-TransBTS: A Multi-Path Learning Framework for Brain Tumor Image\n Segmentation Based on Multi-Physical Information","summary":" Brain Tumor Segmentation (BraTS) plays a critical role in clinical diagnosis,\ntreatment planning, and monitoring the progression of brain tumors. However,\ndue to the variability in tumor appearance, size, and intensity across\ndifferent MRI modalities, automated segmentation remains a challenging task. In\nthis study, we propose a novel Transformer-based framework, multiPI-TransBTS,\nwhich integrates multi-physical information to enhance segmentation accuracy.\nThe model leverages spatial information, semantic information, and multi-modal\nimaging data, addressing the inherent heterogeneity in brain tumor\ncharacteristics. The multiPI-TransBTS framework consists of an encoder, an\nAdaptive Feature Fusion (AFF) module, and a multi-source, multi-scale feature\ndecoder. The encoder incorporates a multi-branch architecture to separately\nextract modality-specific features from different MRI sequences. The AFF module\nfuses information from multiple sources using channel-wise and element-wise\nattention, ensuring effective feature recalibration. The decoder combines both\ncommon and task-specific features through a Task-Specific Feature Introduction\n(TSFI) strategy, producing accurate segmentation outputs for Whole Tumor (WT),\nTumor Core (TC), and Enhancing Tumor (ET) regions. 
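Editor's note on the multiPI-TransBTS entry above: one plausible reading of the AFF module's "channel-wise and element-wise attention" is sketched below. The exact layer design belongs to the paper, so treat the gates and shapes here as assumptions.

```python
import torch
import torch.nn as nn

class AdaptiveFeatureFusion(nn.Module):
    # Fuse multi-branch features with a channel gate, then an element-wise gate.
    def __init__(self, channels: int, branches: int):
        super().__init__()
        c = channels * branches
        self.channel_gate = nn.Sequential(
            nn.AdaptiveAvgPool3d(1), nn.Conv3d(c, c, 1), nn.Sigmoid())
        self.spatial_gate = nn.Sequential(nn.Conv3d(c, 1, 1), nn.Sigmoid())
        self.mix = nn.Conv3d(c, channels, 1)

    def forward(self, feats):                  # list of (B, C, D, H, W) tensors
        x = torch.cat(feats, dim=1)
        x = x * self.channel_gate(x)           # recalibrate each channel
        x = x * self.spatial_gate(x)           # recalibrate each voxel
        return self.mix(x)

aff = AdaptiveFeatureFusion(channels=32, branches=4)   # e.g. four MRI sequences
fused = aff([torch.randn(1, 32, 8, 16, 16) for _ in range(4)])
```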
Comprehensive evaluations on\nthe BraTS2019 and BraTS2020 datasets demonstrate the superiority of\nmultiPI-TransBTS over the state-of-the-art methods. The model consistently\nachieves better Dice coefficients, Hausdorff distances, and Sensitivity scores,\nhighlighting its effectiveness in addressing the BraTS challenges. Our results\nalso indicate the need for further exploration of the balance between precision\nand recall in the ET segmentation task. The proposed framework represents a\nsignificant advancement in BraTS, with potential implications for improving\nclinical outcomes for brain tumor patients.\n","authors":["Hongjun Zhu","Jiaohang Huang","Kuo Chen","Xuehui Ying","Ying Qian"],"pdf_url":"https://arxiv.org/pdf/2409.12167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02762v2","updated":"2024-09-18T17:28:24Z","published":"2024-05-04T21:55:33Z","title":"TK-Planes: Tiered K-Planes with High Dimensional Feature Vectors for\n Dynamic UAV-based Scenes","summary":" In this paper, we present a new approach to bridge the domain gap between\nsynthetic and real-world data for unmanned aerial vehicle (UAV)-based\nperception. Our formulation is designed for dynamic scenes, consisting of small\nmoving objects or human actions. We propose an extension of K-Planes Neural\nRadiance Field (NeRF), wherein our algorithm stores a set of tiered feature\nvectors. The tiered feature vectors are generated to effectively model\nconceptual information about a scene as well as an image decoder that\ntransforms output feature maps into RGB images. Our technique leverages the\ninformation amongst both static and dynamic objects within a scene and is able\nto capture salient scene attributes of high altitude videos. We evaluate its\nperformance on challenging datasets, including Okutama Action and UG2, and\nobserve considerable improvement in accuracy over state-of-the-art neural\nrendering methods.\n","authors":["Christopher Maxey","Jaehoon Choi","Yonghan Lee","Hyungtae Lee","Dinesh Manocha","Heesung Kwon"],"pdf_url":"https://arxiv.org/pdf/2405.02762v2.pdf","comment":"8 pages, submitted to ICRA2025"},{"id":"http://arxiv.org/abs/2409.12162v1","updated":"2024-09-18T17:25:42Z","published":"2024-09-18T17:25:42Z","title":"Precise Forecasting of Sky Images Using Spatial Warping","summary":" The intermittency of solar power, due to occlusion from cloud cover, is one\nof the key factors inhibiting its widespread use in both commercial and\nresidential settings. Hence, real-time forecasting of solar irradiance for\ngrid-connected photovoltaic systems is necessary to schedule and allocate\nresources across the grid. Ground-based imagers that capture wide field-of-view\nimages of the sky are commonly used to monitor cloud movement around a\nparticular site in an effort to forecast solar irradiance. However, these wide\nFOV imagers capture a distorted image of the sky, where regions near the\nhorizon are heavily compressed. This hinders the ability to precisely predict\ncloud motion near the horizon, which especially affects prediction over longer\ntime horizons. In this work, we combat the aforementioned constraint by\nintroducing a deep learning method to predict a future sky image frame with\nhigher resolution than previous methods. Our main contribution is to derive an\noptimal warping method to counter the adverse effects of clouds at the horizon,\nand learn a framework for future sky image prediction which better determines\ncloud evolution for longer time horizons.\n","authors":["Leron Julian","Aswin C. 
Sankaranarayanan"],"pdf_url":"https://arxiv.org/pdf/2409.12162v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12156v1","updated":"2024-09-18T17:18:13Z","published":"2024-09-18T17:18:13Z","title":"JEAN: Joint Expression and Audio-guided NeRF-based Talking Face\n Generation","summary":" We introduce a novel method for joint expression and audio-guided talking\nface generation. Recent approaches either struggle to preserve the speaker\nidentity or fail to produce faithful facial expressions. To address these\nchallenges, we propose a NeRF-based network. Since we train our network on\nmonocular videos without any ground truth, it is essential to learn\ndisentangled representations for audio and expression. We first learn audio\nfeatures in a self-supervised manner, given utterances from multiple subjects.\nBy incorporating a contrastive learning technique, we ensure that the learned\naudio features are aligned to the lip motion and disentangled from the muscle\nmotion of the rest of the face. We then devise a transformer-based architecture\nthat learns expression features, capturing long-range facial expressions and\ndisentangling them from the speech-specific mouth movements. Through\nquantitative and qualitative evaluation, we demonstrate that our method can\nsynthesize high-fidelity talking face videos, achieving state-of-the-art facial\nexpression transfer along with lip synchronization to unseen audio.\n","authors":["Sai Tanmay Reddy Chakkera","Aggelina Chatziagapi","Dimitris Samaras"],"pdf_url":"https://arxiv.org/pdf/2409.12156v1.pdf","comment":"Accepted by BMVC 2024. Project Page:\n https://starc52.github.io/publications/2024-07-19-JEAN"},{"id":"http://arxiv.org/abs/2409.12155v1","updated":"2024-09-18T17:16:57Z","published":"2024-09-18T17:16:57Z","title":"Autopet III challenge: Incorporating anatomical knowledge into nnUNet\n for lesion segmentation in PET/CT","summary":" Lesion segmentation in PET/CT imaging is essential for precise tumor\ncharacterization, which supports personalized treatment planning and enhances\ndiagnostic precision in oncology. However, accurate manual segmentation of\nlesions is time-consuming and prone to inter-observer variability. Given the\nrising demand and clinical use of PET/CT, automated segmentation methods,\nparticularly deep-learning-based approaches, have become increasingly\nrelevant. The autoPET III Challenge focuses on advancing automated segmentation\nof tumor lesions in PET/CT images in a multitracer multicenter setting,\naddressing the clinical need for quantitative, robust, and generalizable\nsolutions. Building on previous challenges, the third iteration of the autoPET\nchallenge introduces a more diverse dataset featuring two different tracers\n(FDG and PSMA) from two clinical centers. To this end, we developed a\nclassifier that identifies the tracer of the given PET/CT based on the Maximum\nIntensity Projection of the PET scan. We trained two individual\nnnUNet ensembles, one for each tracer, where anatomical labels are included as a\nmulti-label task to enhance the model's performance. Our final submission\nachieves cross-validation Dice scores of 76.90% and 61.33% for the publicly\navailable FDG and PSMA datasets, respectively. 
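Editor's note on the autoPET III entry above: the tracer-identification step reduces to computing a Maximum Intensity Projection of the PET volume and classifying it. A minimal sketch follows; the tiny CNN is a placeholder for whatever classifier the team actually trained.

```python
import torch
import torch.nn as nn

def mip(pet_volume: torch.Tensor, axis: int = 0) -> torch.Tensor:
    # Maximum Intensity Projection of a (D, H, W) PET volume along one axis.
    return pet_volume.max(dim=axis).values

classifier = nn.Sequential(                  # placeholder FDG-vs-PSMA classifier
    nn.Conv2d(1, 8, 3, stride=2), nn.ReLU(),
    nn.AdaptiveAvgPool2d(1), nn.Flatten(), nn.Linear(8, 2))

volume = torch.rand(64, 128, 128)            # dummy PET scan
logits = classifier(mip(volume)[None, None]) # (1, 1, H, W) -> 2 tracer classes
tracer = ["FDG", "PSMA"][logits.argmax().item()]
```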
The code is available at\nhttps://github.com/hakal104/autoPETIII/ .\n","authors":["Hamza Kalisch","Fabian Hörst","Ken Herrmann","Jens Kleesiek","Constantin Seibold"],"pdf_url":"https://arxiv.org/pdf/2409.12155v1.pdf","comment":"AutoPET III challenge submission"},{"id":"http://arxiv.org/abs/2409.12140v1","updated":"2024-09-18T17:03:30Z","published":"2024-09-18T17:03:30Z","title":"MoRAG -- Multi-Fusion Retrieval Augmented Generation for Human Motion","summary":" We introduce MoRAG, a novel multi-part fusion based retrieval-augmented\ngeneration strategy for text-based human motion generation. The method enhances\nmotion diffusion models by leveraging additional knowledge obtained through an\nimproved motion retrieval process. By effectively prompting large language\nmodels (LLMs), we address spelling errors and rephrasing issues in motion\nretrieval. Our approach utilizes a multi-part retrieval strategy to improve the\ngeneralizability of motion retrieval across the language space. We create\ndiverse samples through the spatial composition of the retrieved motions.\nFurthermore, by utilizing low-level, part-specific motion information, we can\nconstruct motion samples for unseen text descriptions. Our experiments\ndemonstrate that our framework can serve as a plug-and-play module, improving\nthe performance of motion diffusion models. Code, pretrained models and sample\nvideos will be made available at: https://motion-rag.github.io/\n","authors":["Kalakonda Sai Shashank","Shubh Maheshwari","Ravi Kiran Sarvadevabhatla"],"pdf_url":"https://arxiv.org/pdf/2409.12140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12124v1","updated":"2024-09-18T16:46:36Z","published":"2024-09-18T16:46:36Z","title":"Optimal Visual Search with Highly Heuristic Decision Rules","summary":" Visual search is a fundamental natural task for humans and other animals. We\ninvestigated the decision processes humans use when searching briefly presented\ndisplays having well-separated potential target-object locations. Performance\nwas compared with the Bayesian-optimal decision process under the assumption\nthat the information from the different potential target locations is\nstatistically independent. Surprisingly, humans performed slightly better than\noptimal, despite humans' substantial loss of sensitivity in the fovea, and the\nimplausibility of the human brain replicating the optimal computations. We show\nthat three factors can quantitatively explain these seemingly paradoxical\nresults. Most importantly, simple and fixed heuristic decision rules reach near\noptimal search performance. Secondly, foveal neglect primarily affects only the\ncentral potential target location. Finally, spatially correlated neural noise\ncauses search performance to exceed that predicted for independent noise. These\nfindings have far-reaching implications for understanding visual search tasks\nand other identification tasks in humans and other animals.\n","authors":["Anqi Zhang","Wilson S. Geisler"],"pdf_url":"https://arxiv.org/pdf/2409.12124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12111v1","updated":"2024-09-18T16:30:49Z","published":"2024-09-18T16:30:49Z","title":"Applications of Knowledge Distillation in Remote Sensing: A Survey","summary":" With the ever-growing complexity of models in the field of remote sensing\n(RS), there is an increasing demand for solutions that balance model accuracy\nwith computational efficiency. 
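Editor's note on the optimal visual search entry above: the optimal-versus-heuristic comparison can be simulated directly for a detection variant of search with statistically independent locations. The d-prime, number of locations, and heuristic criterion below are arbitrary choices for illustration.

```python
import numpy as np

rng = np.random.default_rng(0)
M, dprime, trials = 8, 1.5, 200_000
present = rng.random(trials) < 0.5                   # target present on half the trials
loc = rng.integers(0, M, trials)                     # target location when present
x = rng.standard_normal((trials, M))                 # independent noisy responses
x[np.arange(trials), loc] += dprime * present        # signal added at the target site

# Bayesian-optimal rule: average the per-location likelihood ratios, compare to 1.
lr = np.exp(dprime * x - dprime**2 / 2)
optimal = lr.mean(axis=1) > 1.0

# Simple fixed heuristic: respond "present" if the maximum response exceeds a criterion.
heuristic = x.max(axis=1) > 2.2

print("optimal accuracy:  ", (optimal == present).mean())
print("heuristic accuracy:", (heuristic == present).mean())
```

Running this shows the max-rule heuristic tracking the ideal observer closely, which is the paper's central point about near-optimal fixed decision rules.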
Knowledge distillation (KD) has emerged as a\npowerful tool to meet this need, enabling the transfer of knowledge from large,\ncomplex models to smaller, more efficient ones without significant loss in\nperformance. This review article provides an extensive examination of KD and\nits innovative applications in RS. KD, a technique developed to transfer\nknowledge from a complex, often cumbersome model (teacher) to a more compact\nand efficient model (student), has seen significant evolution and application\nacross various domains. Initially, we introduce the fundamental concepts and\nhistorical progression of KD methods. The advantages of employing KD are\nhighlighted, particularly in terms of model compression, enhanced computational\nefficiency, and improved performance, which are pivotal for practical\ndeployments in RS scenarios. The article provides a comprehensive taxonomy of\nKD techniques, where each category is critically analyzed to demonstrate the\nbreadth and depth of the alternative options, and illustrates specific case\nstudies that showcase the practical implementation of KD methods in RS tasks,\nsuch as instance segmentation and object detection. Further, the review\ndiscusses the challenges and limitations of KD in RS, including practical\nconstraints and prospective future directions, providing a comprehensive\noverview for researchers and practitioners in the field of RS. Through this\norganization, the paper not only elucidates the current state of research in KD\nbut also sets the stage for future research opportunities, thereby contributing\nsignificantly to both academic research and real-world applications.\n","authors":["Yassine Himeur","Nour Aburaed","Omar Elharrouss","Iraklis Varlamis","Shadi Atalla","Wathiq Mansoor","Hussain Al Ahmad"],"pdf_url":"https://arxiv.org/pdf/2409.12111v1.pdf","comment":"50 pages, 11 figures and 9 tables"},{"id":"http://arxiv.org/abs/2409.12108v1","updated":"2024-09-18T16:26:56Z","published":"2024-09-18T16:26:56Z","title":"SPRMamba: Surgical Phase Recognition for Endoscopic Submucosal\n Dissection with Mamba","summary":" Endoscopic Submucosal Dissection (ESD) is a minimally invasive procedure\ninitially designed for the treatment of early gastric cancer but is now widely\nused for various gastrointestinal lesions. Computer-assisted surgery systems\nhave played a crucial role in improving the precision and safety of ESD\nprocedures; however, their effectiveness hinges on accurate recognition\nof surgical phases. The intricate nature of ESD, with different lesion\ncharacteristics and tissue structures, presents challenges for real-time\nsurgical phase recognition algorithms. Existing surgical phase recognition\nalgorithms struggle to efficiently capture temporal contexts in video-based\nscenarios, leading to insufficient performance. To address these issues, we\npropose SPRMamba, a novel Mamba-based framework for ESD surgical phase\nrecognition. SPRMamba leverages the strengths of Mamba for long-term temporal\nmodeling while introducing the Scaled Residual TranMamba block to enhance the\ncapture of fine-grained details, overcoming the limitations of traditional\ntemporal models like Temporal Convolutional Networks and Transformers.\nMoreover, a Temporal Sample Strategy is introduced to accelerate the\nprocessing, which is essential for real-time phase recognition in clinical\nsettings. 
Extensive testing on the ESD385 dataset and the cholecystectomy\nCholec80 dataset demonstrates that SPRMamba surpasses existing state-of-the-art\nmethods and exhibits greater robustness across various surgical phase\nrecognition tasks.\n","authors":["Xiangning Zhang","Jinnan Chen","Qingwei Zhang","Chengfeng Zhou","Zhengjie Zhang","Xiaobo Li","Dahong Qian"],"pdf_url":"https://arxiv.org/pdf/2409.12108v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12099v1","updated":"2024-09-18T16:19:57Z","published":"2024-09-18T16:19:57Z","title":"Brain-Streams: fMRI-to-Image Reconstruction with Multi-modal Guidance","summary":" Understanding how humans process visual information is one of the crucial\nsteps for unraveling the underlying mechanism of brain activity. Recently, this\ncuriosity has motivated the fMRI-to-image reconstruction task; given the fMRI\ndata from visual stimuli, it aims to reconstruct the corresponding visual\nstimuli. Surprisingly, leveraging powerful generative models such as the Latent\nDiffusion Model (LDM) has shown promising results in reconstructing complex\nvisual stimuli such as high-resolution natural images from vision datasets.\nDespite the impressive structural fidelity of these reconstructions, they often\nlack details of small objects, ambiguous shapes, and semantic nuances.\nConsequently, the incorporation of additional semantic knowledge, beyond mere\nvisuals, becomes imperative. In light of this, we exploit how modern LDMs\neffectively incorporate multi-modal guidance (text guidance, visual guidance,\nand image layout) for structurally and semantically plausible image\ngenerations. Specifically, inspired by the two-streams hypothesis suggesting\nthat perceptual and semantic information are processed in different brain\nregions, our framework, Brain-Streams, maps fMRI signals from these brain\nregions to appropriate embeddings. That is, by extracting textual guidance from\nsemantic information regions and visual guidance from perceptual information\nregions, Brain-Streams provides accurate multi-modal guidance to LDMs. We\nvalidate the reconstruction ability of Brain-Streams both quantitatively and\nqualitatively on a real fMRI dataset comprising natural image stimuli and fMRI\ndata.\n","authors":["Jaehoon Joo","Taejin Jeong","Seongjae Hwang"],"pdf_url":"https://arxiv.org/pdf/2409.12099v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.12214v2","updated":"2024-09-18T16:18:29Z","published":"2024-07-16T23:34:55Z","title":"VideoClusterNet: Self-Supervised and Adaptive Face Clustering For Videos","summary":" With the rise of digital media content production, the need for analyzing\nmovies and TV series episodes to locate the main cast of characters precisely\nis gaining importance. Specifically, Video Face Clustering aims to group\ntogether detected video face tracks with common facial identities. This problem\nis very challenging due to the large range of pose, expression, appearance, and\nlighting variations of a given face across video frames. Generic pre-trained\nFace Identification (ID) models fail to adapt well to the video production\ndomain, given its high dynamic range content and unique cinematic style.\nFurthermore, traditional clustering algorithms depend on hyperparameters\nrequiring individual tuning across datasets. In this paper, we present a novel\nvideo face clustering approach that learns to adapt a generic face ID model to\nnew video face tracks in a fully self-supervised fashion. 
We also propose a\nparameter-free clustering algorithm that is capable of automatically adapting\nto the finetuned model's embedding space for any input video. Due to the lack\nof comprehensive movie face clustering benchmarks, we also present a\nfirst-of-its-kind movie dataset: MovieFaceCluster. Our dataset is handpicked by\nfilm industry professionals and contains extremely challenging face ID\nscenarios. Experiments show our method's effectiveness in handling difficult\nmainstream movie scenes on our benchmark dataset and state-of-the-art\nperformance on traditional TV series datasets.\n","authors":["Devesh Walawalkar","Pablo Garrido"],"pdf_url":"https://arxiv.org/pdf/2407.12214v2.pdf","comment":"Accepted at European Conference on Computer Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2312.09387v2","updated":"2024-09-18T16:11:53Z","published":"2023-12-14T23:00:11Z","title":"High-Resolution Maps of Left Atrial Displacements and Strains Estimated\n with 3D Cine MRI using Online Learning Neural Networks","summary":" The functional analysis of the left atrium (LA) is important for evaluating\ncardiac health and understanding diseases like atrial fibrillation. Cine MRI is\nideally placed for the detailed 3D characterization of LA motion and\ndeformation but lacks appropriate acquisition and analysis tools. Here, we\npropose tools for the Analysis for Left Atrial Displacements and DeformatIons\nusing online learning neural Networks (Aladdin) and present a technical\nfeasibility study on how Aladdin can characterize 3D LA function globally and\nregionally. Aladdin includes an online segmentation and image registration\nnetwork, and a strain calculation pipeline tailored to the LA. We create maps\nof LA Displacement Vector Field (DVF) magnitude and LA principal strain values\nfrom images of 10 healthy volunteers and 8 patients with cardiovascular disease\n(CVD), of which 2 had large left ventricular ejection fraction (LVEF)\nimpairment. We additionally create an atlas of these biomarkers using the data\nfrom the healthy volunteers. Results showed that Aladdin can accurately track\nthe LA wall across the cardiac cycle and characterize its motion and\ndeformation. Global LA function markers assessed with Aladdin agree well with\nestimates from 2D Cine MRI. A more marked active contraction phase was observed\nin the healthy cohort, while the CVD LVEF group showed overall reduced LA\nfunction. Aladdin is uniquely able to identify LA regions with abnormal\ndeformation metrics that may indicate focal pathology. We expect Aladdin to\nhave important clinical applications as it can non-invasively characterize\natrial pathophysiology. All source code and data are available at:\nhttps://github.com/cgalaz01/aladdin_cmr_la.\n","authors":["Christoforos Galazis","Samuel Shepperd","Emma Brouwer","Sandro Queirós","Ebraham Alskaf","Mustafa Anjari","Amedeo Chiribiri","Jack Lee","Anil A. Bharath","Marta Varela"],"pdf_url":"https://arxiv.org/pdf/2312.09387v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12078v1","updated":"2024-09-18T15:53:45Z","published":"2024-09-18T15:53:45Z","title":"Denoising diffusion models for high-resolution microscopy image\n restoration","summary":" Advances in microscopy imaging enable researchers to visualize structures at\nthe nanoscale level, thereby unraveling intricate details of biological\norganization. 
However, challenges such as image noise, photobleaching of\nfluorophores, and low tolerability of biological samples to high light doses\nremain, restricting temporal resolutions and experiment durations. Reduced\nlaser doses enable longer measurements at the cost of lower resolution and\nincreased noise, which hinders accurate downstream analyses. Here we train a\ndenoising diffusion probabilistic model (DDPM) to predict high-resolution\nimages by conditioning the model on low-resolution information. Additionally,\nthe probabilistic aspect of the DDPM allows for repeated generation of images\nthat tend to further increase the signal-to-noise ratio. We show that our model\nachieves a performance that is better or similar to the previously\nbest-performing methods, across four highly diverse datasets. Importantly,\nwhile any of the previous methods show competitive performance for some, but\nnot all datasets, our method consistently achieves high performance across all\nfour data sets, suggesting high generalizability.\n","authors":["Pamela Osuna-Vargas","Maren H. Wehrheim","Lucas Zinz","Johanna Rahm","Ashwin Balakrishnan","Alexandra Kaminer","Mike Heilemann","Matthias Kaschube"],"pdf_url":"https://arxiv.org/pdf/2409.12078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12074v1","updated":"2024-09-18T15:48:05Z","published":"2024-09-18T15:48:05Z","title":"Online Refractive Camera Model Calibration in Visual Inertial Odometry","summary":" This paper presents a general refractive camera model and online\nco-estimation of odometry and the refractive index of unknown media. This\nenables operation in diverse and varying refractive fluids, given only the\ncamera calibration in air. The refractive index is estimated online as a state\nvariable of a monocular visual-inertial odometry framework in an iterative\nformulation using the proposed camera model. The method was verified on data\ncollected using an underwater robot traversing inside a pool. The evaluations\ndemonstrate convergence to the ideal refractive index for water despite\nsignificant perturbations in the initialization. Simultaneously, the approach\nenables on-par visual-inertial odometry performance in refractive media without\nprior knowledge of the refractive index or requirement of medium-specific\ncamera calibration.\n","authors":["Mohit Singh","Kostas Alexis"],"pdf_url":"https://arxiv.org/pdf/2409.12074v1.pdf","comment":"Accepted at the 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024), 8 pages"},{"id":"http://arxiv.org/abs/2409.12072v1","updated":"2024-09-18T15:47:23Z","published":"2024-09-18T15:47:23Z","title":"PAD-FT: A Lightweight Defense for Backdoor Attacks via Data Purification\n and Fine-Tuning","summary":" Backdoor attacks pose a significant threat to deep neural networks,\nparticularly as recent advancements have led to increasingly subtle\nimplantation, making the defense more challenging. Existing defense mechanisms\ntypically rely on an additional clean dataset as a standard reference and\ninvolve retraining an auxiliary model or fine-tuning the entire victim model.\nHowever, these approaches are often computationally expensive and not always\nfeasible in practical applications. In this paper, we propose a novel and\nlightweight defense mechanism, termed PAD-FT, that does not require an\nadditional clean dataset and fine-tunes only a very small part of the model to\ndisinfect the victim model. 
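Editor's note on the microscopy DDPM entry above: a single training step of the low-resolution-conditioned diffusion model can be sketched as follows. The toy conv denoiser and scalar time map stand in for the real U-Net and time embedding; the noise schedule values are conventional defaults, not the paper's.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

T = 1000
betas = torch.linspace(1e-4, 0.02, T)
alphas_bar = torch.cumprod(1.0 - betas, dim=0)

# Stand-in denoiser: sees [noisy hi-res, low-res condition, time map] as channels.
denoiser = nn.Conv2d(3, 1, 3, padding=1)

hi = torch.rand(4, 1, 64, 64)                              # clean high-res target
cond = F.interpolate(F.avg_pool2d(hi, 4), scale_factor=4)  # low-res conditioning

t = torch.randint(0, T, (4,))
noise = torch.randn_like(hi)
a = alphas_bar[t].view(-1, 1, 1, 1)
x_t = a.sqrt() * hi + (1 - a).sqrt() * noise               # forward diffusion to step t
t_map = (t.float() / T).view(-1, 1, 1, 1).expand(-1, 1, 64, 64)
loss = F.mse_loss(denoiser(torch.cat([x_t, cond, t_map], dim=1)), noise)
loss.backward()                                            # epsilon-prediction objective
```

At inference, repeated sampling from the same conditioning yields multiple restorations whose average tends to raise the signal-to-noise ratio, as the entry notes.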
To achieve this, our approach first introduces a\nsimple data purification process to identify and select the most-likely clean\ndata from the poisoned training dataset. The self-purified clean dataset is\nthen used for activation clipping and fine-tuning only the last classification\nlayer of the victim model. By integrating data purification, activation\nclipping, and classifier fine-tuning, our mechanism PAD-FT demonstrates\nsuperior effectiveness across multiple backdoor attack methods and datasets, as\nconfirmed through extensive experimental evaluation.\n","authors":["Yukai Xu","Yujie Gu","Kouichi Sakurai"],"pdf_url":"https://arxiv.org/pdf/2409.12072v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05225v2","updated":"2024-09-18T15:17:43Z","published":"2024-09-08T21:43:54Z","title":"Comparison of Two Augmentation Methods in Improving Detection Accuracy\n of Hemarthrosis","summary":" With the increase of computing power, machine learning models in medical\nimaging have been introduced to help in rendering medical diagnosis and\ninspection for conditions like hemophilia, a rare disorder in which blood cannot clot\nnormally. Often, one of the bottlenecks of detecting hemophilia is the lack of\ndata available to train the algorithm to increase the accuracy. As a possible\nsolution, this research investigated whether introducing augmented data by data\nsynthesis or traditional augmentation techniques can improve model accuracy,\nhelping to diagnose the diseases. To tackle this research, features of\nultrasound images were extracted by the pre-trained VGG-16, and similarities\nwere compared by cosine similarity measure based on extracted features in\ndifferent distributions among real images, synthetic images, and augmentation\nimages (Real vs. Real, Syn vs. Syn, Real vs. Different Batches of Syn, Real vs.\nAugmentation Techniques). Model testing performance was investigated using\nEfficientNet-B4 to recognize \"blood\" images with two augmentation methods. In\naddition, a gradient-weighted class activation mapping (Grad-CAM) visualization\nwas used to interpret the unexpected results like loss of accuracy. Synthetic\nand real images do not show high similarity, with a mean similarity score of\n0.4737. Synthetic batch 1 dataset and images by horizontal flip are more\nsimilar to the original images. Classic augmentation techniques and data\nsynthesis can improve model accuracy, and data by traditional augmentation\ntechniques have a better performance than synthetic data. In addition, the\nGrad-CAM heatmap revealed that the loss of accuracy is due to a shift in the\ndomain. Overall, this research found that two augmentation methods, data\nsynthesis and traditional augmentation techniques, both can improve accuracy to\na certain extent to help to diagnose rare diseases.\n","authors":["Qianyu Fan"],"pdf_url":"https://arxiv.org/pdf/2409.05225v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12040v1","updated":"2024-09-18T14:59:30Z","published":"2024-09-18T14:59:30Z","title":"SFDA-rPPG: Source-Free Domain Adaptive Remote Physiological Measurement\n with Spatio-Temporal Consistency","summary":" Remote Photoplethysmography (rPPG) is a non-contact method that uses facial\nvideo to predict changes in blood volume, enabling physiological metrics\nmeasurement. Traditional rPPG models often struggle with poor generalization\ncapacity in unseen domains. Current solutions to this problem are to improve the model's\ngeneralization in the target domain through Domain Generalization (DG) or\nDomain Adaptation (DA). 
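Editor's note on the PAD-FT entry above: putting its three ingredients together, a schematic might look like the following. The loss-ranking purification rule and the clamp-based activation clipping are common heuristics assumed here, not necessarily the paper's exact choices.

```python
import torch
import torch.nn as nn

def pad_ft(model: nn.Sequential, data, labels, keep_frac=0.2, clip=4.0, steps=100):
    backbone, head = model[:-1], model[-1]
    # 1) Purify: keep the lowest-loss samples as the most-likely clean subset.
    with torch.no_grad():
        losses = nn.functional.cross_entropy(model(data), labels, reduction="none")
    keep = losses.argsort()[: int(keep_frac * len(data))]
    x_clean, y_clean = data[keep], labels[keep]
    # 2) + 3) Activation clipping and last-layer fine-tuning on the purified set.
    opt = torch.optim.SGD(head.parameters(), lr=1e-2)
    for _ in range(steps):
        feats = backbone(x_clean).clamp(max=clip)   # activation clipping
        loss = nn.functional.cross_entropy(head(feats), y_clean)
        opt.zero_grad(); loss.backward(); opt.step()
    return model

model = nn.Sequential(nn.Flatten(), nn.Linear(784, 128), nn.ReLU(), nn.Linear(128, 10))
pad_ft(model, torch.randn(512, 1, 28, 28), torch.randint(0, 10, (512,)))
```

Only the classifier head is optimized, which is what keeps the defense lightweight relative to full retraining.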
However, both traditional methods require access to\nboth source domain data and target domain data, which is not feasible in\nscenarios with limited access to source data and also raises privacy\nconcerns about accessing source domain data. In this paper, we propose the first\nSource-free Domain Adaptation benchmark for rPPG measurement (SFDA-rPPG), which\novercomes these limitations by enabling effective domain adaptation without\naccess to source domain data. Our framework incorporates a Three-Branch\nSpatio-Temporal Consistency Network (TSTC-Net) to enhance feature consistency\nacross domains. Furthermore, we propose a new rPPG distribution alignment loss\nbased on the Frequency-domain Wasserstein Distance (FWD), which leverages\noptimal transport to align power spectrum distributions across domains\neffectively and further enforces the alignment of the three branches. Extensive\ncross-domain experiments and ablation studies demonstrate the effectiveness of\nour proposed method in source-free domain adaptation settings. Our findings\nhighlight the significant contribution of the proposed FWD loss for\ndistributional alignment, providing a valuable reference for future research\nand applications. The source code is available at\nhttps://github.com/XieYiping66/SFDA-rPPG\n","authors":["Yiping Xie","Zitong Yu","Bingjie Wu","Weicheng Xie","Linlin Shen"],"pdf_url":"https://arxiv.org/pdf/2409.12040v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19797v3","updated":"2024-09-18T14:56:45Z","published":"2024-03-28T19:25:25Z","title":"Efficient 3D Instance Mapping and Localization with Neural Fields","summary":" We tackle the problem of learning an implicit scene representation for 3D\ninstance segmentation from a sequence of posed RGB images. Towards this, we\nintroduce 3DIML, a novel framework that efficiently learns a neural label field\nwhich can render 3D instance segmentation masks from novel viewpoints. As opposed\nto prior art that optimizes a neural field in a self-supervised manner,\nrequiring complicated training procedures and loss function design, 3DIML\nleverages a two-phase process. The first phase, InstanceMap, takes as input 2D\nsegmentation masks of the image sequence generated by a frontend instance\nsegmentation model, and associates corresponding masks across images to 3D\nlabels. These almost 3D-consistent pseudolabel masks are then used in the\nsecond phase, InstanceLift, to supervise the training of a neural label field,\nwhich interpolates regions missed by InstanceMap and resolves ambiguities.\nAdditionally, we introduce InstanceLoc, which enables near real-time\nlocalization of instance masks given a trained neural label field. We evaluate\n3DIML on sequences from the Replica and ScanNet datasets and demonstrate its\neffectiveness under mild assumptions for the image sequences. 
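Editor's note on the SFDA-rPPG entry above: the FWD loss has a convenient closed form in 1D, since the Wasserstein-1 distance between two normalized power spectra is the L1 distance between their CDFs. A sketch in that spirit follows; binning and normalization details are assumptions.

```python
import torch

def fwd_loss(sig_a: torch.Tensor, sig_b: torch.Tensor) -> torch.Tensor:
    # 1D Wasserstein distance between normalized power spectra of two rPPG signals.
    def spectrum(x):
        p = torch.fft.rfft(x).abs() ** 2          # power spectrum
        return p / p.sum()                        # normalize to a distribution
    pa, pb = spectrum(sig_a), spectrum(sig_b)
    # For 1D distributions, W1 reduces to the L1 distance between CDFs.
    return (torch.cumsum(pa, 0) - torch.cumsum(pb, 0)).abs().sum()

loss = fwd_loss(torch.randn(300), torch.randn(300))  # e.g. 10 s of 30 fps rPPG
```

Everything here is differentiable, so the distance can serve directly as a training loss for aligning branch outputs across domains.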
We achieve a\nlarge practical speedup over existing implicit scene representation methods\nwith comparable quality, showcasing its potential to facilitate faster and more\neffective 3D scene understanding.\n","authors":["George Tang","Krishna Murthy Jatavallabhula","Antonio Torralba"],"pdf_url":"https://arxiv.org/pdf/2403.19797v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12034v1","updated":"2024-09-18T14:51:36Z","published":"2024-09-18T14:51:36Z","title":"Multi-Sensor Deep Learning for Glacier Mapping","summary":" The more than 200,000 glaciers outside the ice sheets play a crucial role in\nour society by influencing sea-level rise, water resource management, natural\nhazards, biodiversity, and tourism. However, only a fraction of these glaciers\nbenefit from consistent and detailed in-situ observations that allow for\nassessing their status and changes over time. This limitation can, in part, be\novercome by relying on satellite-based Earth Observation techniques.\nSatellite-based glacier mapping applications have historically mainly relied on\nmanual and semi-automatic detection methods, while recently, a fast and notable\ntransition to deep learning techniques has started.\n This chapter reviews how combining multi-sensor remote sensing data and deep\nlearning allows us to better delineate (i.e. map) glaciers and detect their\ntemporal changes. We explain how relying on deep learning multi-sensor\nframeworks to map glaciers benefits from the extensive availability of regional\nand global glacier inventories. We also analyse the rationale behind glacier\nmapping, the benefits of deep learning methodologies, and the inherent\nchallenges in integrating multi-sensor earth observation data with deep\nlearning algorithms.\n While our review aims to provide a broad overview of glacier mapping efforts,\nwe highlight a few setups where deep learning multi-sensor remote sensing\napplications have a considerable potential added value. This includes\napplications for debris-covered and rock glaciers that are visually difficult\nto distinguish from surroundings and for calving glaciers that are in contact\nwith the ocean. These specific cases are illustrated through a series of visual\nimageries, highlighting some significant advantages and challenges when\ndetecting glacier changes, including dealing with seasonal snow cover, changing\ndebris coverage, and distinguishing glacier fronts from the surrounding sea\nice.\n","authors":["Codruţ-Andrei Diaconu","Konrad Heidler","Jonathan L. Bamber","Harry Zekollari"],"pdf_url":"https://arxiv.org/pdf/2409.12034v1.pdf","comment":"This article will be a chapter of the book Deep Learning for\n Multi-Sensor Earth Observation, to be published by Elsevier"},{"id":"http://arxiv.org/abs/2409.12031v1","updated":"2024-09-18T14:48:50Z","published":"2024-09-18T14:48:50Z","title":"PhysMamba: Efficient Remote Physiological Measurement with SlowFast\n Temporal Difference Mamba","summary":" Facial-video based Remote photoplethysmography (rPPG) aims at measuring\nphysiological signals and monitoring heart activity without any contact,\nshowing significant potential in various applications. Previous deep learning\nbased rPPG measurement methods are primarily based on CNNs and Transformers. However,\nthe limited receptive fields of CNNs restrict their ability to capture\nlong-range spatio-temporal dependencies, while Transformers also struggle with\nmodeling long video sequences with high complexity. 
Recently, the state space\nmodels (SSMs) represented by Mamba are known for their impressive performance\non capturing long-range dependencies from long sequences. In this paper, we\npropose the PhysMamba, a Mamba-based framework, to efficiently represent\nlong-range physiological dependencies from facial videos. Specifically, we\nintroduce the Temporal Difference Mamba block to first enhance local dynamic\ndifferences and further model the long-range spatio-temporal context. Moreover,\na dual-stream SlowFast architecture is utilized to fuse the multi-scale\ntemporal features. Extensive experiments are conducted on three benchmark\ndatasets to demonstrate the superiority and efficiency of PhysMamba. The codes\nare available at https://github.com/Chaoqi31/PhysMamba\n","authors":["Chaoqi Luo","Yiping Xie","Zitong Yu"],"pdf_url":"https://arxiv.org/pdf/2409.12031v1.pdf","comment":"Accepted by CCBR 2024"},{"id":"http://arxiv.org/abs/2409.12026v1","updated":"2024-09-18T14:36:50Z","published":"2024-09-18T14:36:50Z","title":"On Vision Transformers for Classification Tasks in Side-Scan Sonar\n Imagery","summary":" Side-scan sonar (SSS) imagery presents unique challenges in the\nclassification of man-made objects on the seafloor due to the complex and\nvaried underwater environments. Historically, experts have manually interpreted\nSSS images, relying on conventional machine learning techniques with\nhand-crafted features. While Convolutional Neural Networks (CNNs) significantly\nadvanced automated classification in this domain, they often fall short when\ndealing with diverse seafloor textures, such as rocky or ripple sand bottoms,\nwhere false positive rates may increase. Recently, Vision Transformers (ViTs)\nhave shown potential in addressing these limitations by utilizing a\nself-attention mechanism to capture global information in image patches,\noffering more flexibility in processing spatial hierarchies. This paper\nrigorously compares the performance of ViT models alongside commonly used CNN\narchitectures, such as ResNet and ConvNext, for binary classification tasks in\nSSS imagery. The dataset encompasses diverse geographical seafloor types and is\nbalanced between the presence and absence of man-made objects. ViT-based models\nexhibit superior classification performance across f1-score, precision, recall,\nand accuracy metrics, although at the cost of greater computational resources.\nCNNs, with their inductive biases, demonstrate better computational efficiency,\nmaking them suitable for deployment in resource-constrained environments like\nunderwater vehicles. Future research directions include exploring\nself-supervised learning for ViTs and multi-modal fusion to further enhance\nperformance in challenging underwater environments.\n","authors":["BW Sheffield","Jeffrey Ellen","Ben Whitmore"],"pdf_url":"https://arxiv.org/pdf/2409.12026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08672v3","updated":"2024-09-18T14:35:56Z","published":"2022-11-16T04:59:46Z","title":"Mitigating Urban-Rural Disparities in Contrastive Representation\n Learning with Satellite Imagery","summary":" Satellite imagery is being leveraged for many societally critical tasks\nacross climate, economics, and public health. Yet, because of heterogeneity in\nlandscapes (e.g. how a road looks in different places), models can show\ndisparate performance across geographic areas. 
Given the potential impact of\ndisparities in algorithmic systems used in societal contexts, here we consider\nthe risk of urban-rural disparities in the identification of land-cover features.\nWe study this via semantic segmentation (a common computer vision task in which image\nregions are labelled according to what is being shown), using pre-trained\nimage representations generated via contrastive self-supervised learning. We\npropose fair dense representation with contrastive learning (FairDCL) as a\nmethod for de-biasing the multi-level latent space of convolutional neural\nnetwork models. The method improves feature identification by removing spurious\nmodel representations which are disparately distributed across urban and rural\nareas, and does so in an unsupervised way during contrastive pre-training. The\nobtained image representation mitigates downstream urban-rural prediction\ndisparities and outperforms state-of-the-art baselines on real-world satellite\nimages. Embedding space evaluation and ablation studies further demonstrate\nFairDCL's robustness. As generalizability and robustness in geographic imagery\nare nascent topics, our work motivates researchers to consider metrics beyond\naverage accuracy in such applications.\n","authors":["Miao Zhang","Rumi Chunara"],"pdf_url":"https://arxiv.org/pdf/2211.08672v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12024v1","updated":"2024-09-18T14:34:06Z","published":"2024-09-18T14:34:06Z","title":"LEMON: Localized Editing with Mesh Optimization and Neural Shaders","summary":" In practical use cases, editing a polygonal mesh can be faster than generating\na new one, but it can still be challenging and time-consuming for users.\nExisting solutions for this problem tend to focus on a single task, either\ngeometry or novel view synthesis, which often leads to disjointed results\nbetween the mesh and view. In this work, we propose LEMON, a mesh editing\npipeline that combines neural deferred shading with localized mesh\noptimization. Our approach begins by identifying the most important vertices in\nthe mesh for editing, utilizing a segmentation model to focus on these key\nregions. Given multi-view images of an object, we optimize a neural shader and\na polygonal mesh while extracting the normal map and the rendered image from\neach view. By using these outputs as conditioning data, we edit the input\nimages with a text-to-image diffusion model and iteratively update our dataset\nwhile deforming the mesh. This process results in a polygonal mesh that is\nedited according to the given text instruction, preserving the geometric\ncharacteristics of the initial mesh while focusing on the most significant\nareas. We evaluate our pipeline using the DTU dataset, demonstrating that it\ngenerates finely-edited meshes more rapidly than the current state-of-the-art\nmethods. We include our code and additional results in the supplementary\nmaterial.\n","authors":["Furkan Mert Algan","Umut Yazgan","Driton Salihu","Cem Eteke","Eckehard Steinbach"],"pdf_url":"https://arxiv.org/pdf/2409.12024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12016v1","updated":"2024-09-18T14:29:43Z","published":"2024-09-18T14:29:43Z","title":"Computational Imaging for Long-Term Prediction of Solar Irradiance","summary":" The occlusion of the sun by clouds is one of the primary sources of\nuncertainty in solar power generation, and is a factor that affects the\nwidespread use of solar power as a primary energy source. 
Real-time\nforecasting of cloud movement and, as a result, solar irradiance is necessary\nto schedule and allocate energy across grid-connected photovoltaic systems.\nPrevious works monitored cloud movement using wide-angle field-of-view imagery\nof the sky. However, such images have poor resolution for clouds that appear\nnear the horizon, which reduces their effectiveness for long-term prediction of\nsolar occlusion. Specifically, to be able to predict occlusion of the sun over\nlong time periods, clouds that are near the horizon need to be detected, and\ntheir velocities estimated precisely. To enable such a system, we design and\ndeploy a catadioptric system that delivers wide-angle imagery with uniform\nspatial resolution of the sky over its field of view. To enable prediction over\na longer time horizon, we design an algorithm that uses carefully selected\nspatio-temporal slices of the imagery using estimated wind direction and\nvelocity as inputs. Using ray-tracing simulations as well as a real testbed\ndeployed outdoors, we show that the system is capable of predicting solar\nocclusion as well as irradiance for tens of minutes in the future, which is an\norder of magnitude improvement over prior work.\n","authors":["Leron Julian","Haejoon Lee","Soummya Kar","Aswin C. Sankaranarayanan"],"pdf_url":"https://arxiv.org/pdf/2409.12016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.07106v2","updated":"2024-09-18T14:27:41Z","published":"2023-08-14T12:38:43Z","title":"Checklist to Define the Identification of TP, FP, and FN Object\n Detections in Automated Driving","summary":" The object perception of automated driving systems must pass quality and\nrobustness tests before safe deployment. Such tests typically identify true\npositive (TP), false-positive (FP), and false-negative (FN) detections and\naggregate them into metrics. Since the literature seems to be lacking a\ncomprehensive way to define the identification of TPs/FPs/FNs, this paper\nprovides a checklist of relevant functional aspects and implementation details.\nBesides labeling policies of the test set, we cover areas of vision, occlusion\nhandling, safety-relevant areas, matching criteria, temporal and probabilistic\nissues, and further aspects. Even though the checklist cannot be fully\nformalized, it can help practitioners minimize the ambiguity of their tests,\nwhich, in turn, makes statements on object perception more reliable and\ncomparable.\n","authors":["Michael Hoss"],"pdf_url":"https://arxiv.org/pdf/2308.07106v2.pdf","comment":"This version improves the checklist's usability by providing bullet\n points to follow. It also condenses the contributions to safety assurance\n down to the \"Related Work\" section. 11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.12011v1","updated":"2024-09-18T14:25:02Z","published":"2024-09-18T14:25:02Z","title":"Mixture of Prompt Learning for Vision Language Models","summary":" As powerful pre-trained vision-language models (VLMs) like CLIP gain\nprominence, numerous studies have attempted to apply VLMs to downstream\ntasks. Among these, prompt learning has been validated as an effective method\nfor adapting to new tasks, as it requires only a small number of parameters.\nHowever, current prompt learning methods face two challenges: first, a single\nsoft prompt struggles to capture the diverse styles and patterns within a\ndataset; second, fine-tuning soft prompts is prone to overfitting. 
To address\nthese challenges, we propose a mixture-of-soft-prompts learning method\nthat incorporates a routing module. This module captures a dataset's\nvaried styles and dynamically selects the most suitable prompts for each\ninstance. Additionally, we introduce a novel gating mechanism to ensure the\nrouter selects prompts based on their similarity to hard prompt templates,\nwhich both retains knowledge from hard prompts and improves selection\naccuracy. We also implement semantically grouped text-level supervision,\ninitializing each soft prompt with the token embeddings of manually designed\ntemplates from its group and applying a contrastive loss between the resulting\ntext feature and the hard-prompt-encoded text feature. This supervision ensures\nthat the text features derived from soft prompts remain close to those from\ntheir corresponding hard prompts, preserving initial knowledge and mitigating\noverfitting. Our method has been validated on 11 datasets, demonstrating\nclear improvements in few-shot learning, domain generalization, and\nbase-to-new generalization scenarios compared to existing baselines. The code\nwill be available at \\url{https://anonymous.4open.science/r/mocoop-6387}\n","authors":["Yu Du","Tong Niu","Rong Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.12011v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12010v1","updated":"2024-09-18T14:24:29Z","published":"2024-09-18T14:24:29Z","title":"ChefFusion: Multimodal Foundation Model Integrating Recipe and Food\n Image Generation","summary":" Significant work has been conducted in the domain of food computing, yet\nthese studies typically focus on single tasks such as t2t (instruction\ngeneration from food titles and ingredients), i2t (recipe generation from food\nimages), or t2i (food image generation from recipes). None of these approaches\nintegrate all modalities simultaneously. To address this gap, we introduce a\nnovel food computing foundation model that achieves true multimodality,\nencompassing tasks such as t2t, t2i, i2t, it2t, and t2ti. By leveraging large\nlanguage models (LLMs) and pre-trained image encoder and decoder models, our\nmodel can perform a diverse array of food computing-related tasks, including\nfood understanding, food recognition, recipe generation, and food image\ngeneration. Compared to previous models, our foundation model demonstrates a\nsignificantly broader range of capabilities and exhibits superior performance,\nparticularly in food image generation and recipe generation tasks. We\nhave open-sourced ChefFusion on GitHub.\n","authors":["Peiyu Li","Xiaobao Huang","Yijun Tian","Nitesh V. Chawla"],"pdf_url":"https://arxiv.org/pdf/2409.12010v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12008v1","updated":"2024-09-18T14:21:07Z","published":"2024-09-18T14:21:07Z","title":"Panoptic-Depth Forecasting","summary":" Forecasting the semantics and 3D structure of scenes is essential for robots\nto navigate and plan actions safely. Recent methods have explored semantic and\npanoptic scene forecasting; however, they do not consider the geometry of the\nscene. In this work, we propose the panoptic-depth forecasting task for jointly\npredicting the panoptic segmentation and depth maps of unobserved future\nframes from monocular camera images. To facilitate this work, we extend the\npopular KITTI-360 and Cityscapes benchmarks by computing depth maps from LiDAR\npoint clouds and leveraging sequential labeled data. 
We also introduce a\nsuitable evaluation metric that quantifies both the panoptic quality and depth\nestimation accuracy of forecasts in a coherent manner. Furthermore, we present\ntwo baselines and propose the novel PDcast architecture that learns rich\nspatio-temporal representations by incorporating a transformer-based encoder, a\nforecasting module, and task-specific decoders to predict future panoptic-depth\noutputs. Extensive evaluations demonstrate the effectiveness of PDcast across\ntwo datasets and three forecasting tasks, consistently addressing the primary\nchallenges. We make the code publicly available at\nhttps://pdcast.cs.uni-freiburg.de.\n","authors":["Juana Valeria Hurtado","Riya Mohan","Abhinav Valada"],"pdf_url":"https://arxiv.org/pdf/2409.12008v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12002v1","updated":"2024-09-18T14:15:10Z","published":"2024-09-18T14:15:10Z","title":"Towards Global Localization using Multi-Modal Object-Instance\n Re-Identification","summary":" Re-identification (ReID) is a critical challenge in computer vision,\npredominantly studied in the context of pedestrians and vehicles. However,\nrobust object-instance ReID, which has significant implications for tasks such\nas autonomous exploration, long-term perception, and scene understanding,\nremains underexplored. In this work, we address this gap by proposing a novel\ndual-path object-instance re-identification transformer architecture that\nintegrates multimodal RGB and depth information. By leveraging depth data, we\ndemonstrate improvements in ReID across scenes that are cluttered or have\nvarying illumination conditions. Additionally, we develop a ReID-based\nlocalization framework that enables accurate camera localization and pose\nidentification across different viewpoints. We validate our methods using two\ncustom-built RGB-D datasets, as well as multiple sequences from the open-source\nTUM RGB-D datasets. Our approach demonstrates significant improvements in both\nobject instance ReID (mAP of 75.18) and localization accuracy (success rate of\n83% on TUM-RGBD), highlighting the essential role of object ReID in advancing\nrobotic perception. Our models, frameworks, and datasets have been made\npublicly available.\n","authors":["Aneesh Chavan","Vaibhav Agrawal","Vineeth Bhat","Sarthak Chittawar","Siddharth Srivastava","Chetan Arora","K Madhava Krishna"],"pdf_url":"https://arxiv.org/pdf/2409.12002v1.pdf","comment":"8 pages, 5 figures, 3 tables. Submitted to ICRA 2025"},{"id":"http://arxiv.org/abs/2312.11973v5","updated":"2024-09-18T14:02:13Z","published":"2023-12-19T09:11:49Z","title":"Continual Learning: Forget-free Winning Subnetworks for Video\n Representations","summary":" Inspired by the Lottery Ticket Hypothesis (LTH), which highlights the\nexistence of efficient subnetworks within larger, dense networks, a\nhigh-performing Winning Subnetwork (WSN) in terms of task performance under\nappropriate sparsity conditions is considered for various continual learning\ntasks. It leverages pre-existing weights from dense networks to achieve\nefficient learning in Task Incremental Learning (TIL) and Task-agnostic\nIncremental Learning (TaIL) scenarios. In Few-Shot Class Incremental Learning\n(FSCIL), a variation of WSN referred to as the Soft subnetwork (SoftNet) is\ndesigned to prevent overfitting when the data samples are scarce. Furthermore,\nthe sparse reuse of WSN weights is considered for Video Incremental Learning\n(VIL). 
The use of a Fourier Subneural Operator (FSO) within WSN is also considered; it\nenables compact encoding of videos and identifies reusable subnetworks across\nvarying bandwidths. We have integrated FSO into different architectural\nframeworks for continual learning, including VIL, TIL, and FSCIL. Our\ncomprehensive experiments demonstrate FSO's effectiveness, significantly\nimproving task performance at various convolutional representational levels.\nSpecifically, FSO enhances higher-layer performance in TIL and FSCIL and\nlower-layer performance in VIL.\n","authors":["Haeyong Kang","Jaehong Yoon","Sung Ju Hwang","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2312.11973v5.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.14962,\n arXiv:2306.11305"},{"id":"http://arxiv.org/abs/2407.20784v2","updated":"2024-09-18T14:01:47Z","published":"2024-07-27T15:41:13Z","title":"Inverse Problems with Diffusion Models: A MAP Estimation Perspective","summary":" Inverse problems have many applications in science and engineering. In\ncomputer vision, several image restoration tasks such as inpainting,\ndeblurring, and super-resolution can be formally modeled as inverse problems.\nRecently, methods have been developed for solving inverse problems that only\nleverage a pre-trained unconditional diffusion model and do not require\nadditional task-specific training. In such methods, however, the inherent\nintractability of determining the conditional score function during the reverse\ndiffusion process poses a real challenge, leaving the methods to settle for an\napproximation instead, which affects their performance in practice. Here, we\npropose a MAP estimation framework to model the reverse conditional generation\nprocess of a continuous-time diffusion model as an optimization process of the\nunderlying MAP objective, whose gradient term is tractable. In theory, the\nproposed framework can be applied to solve general inverse problems using\ngradient-based optimization methods. However, given the highly non-convex\nnature of the loss objective, finding a perfect gradient-based optimization\nalgorithm can be quite challenging; nevertheless, our framework offers several\npotential research directions. We use our proposed formulation to develop\nempirically effective algorithms for image restoration. We validate our\nproposed algorithms with extensive experiments over multiple datasets across\nseveral restoration tasks.\n","authors":["Sai Bharath Chandra Gutha","Ricardo Vinuesa","Hossein Azizpour"],"pdf_url":"https://arxiv.org/pdf/2407.20784v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.08822v2","updated":"2024-09-18T13:50:46Z","published":"2024-08-16T16:12:44Z","title":"PFDiff: Training-free Acceleration of Diffusion Models through the\n Gradient Guidance of Past and Future","summary":" Diffusion Probabilistic Models (DPMs) have shown remarkable potential in\nimage generation, but their sampling efficiency is hindered by the need for\nnumerous denoising steps. Most existing solutions accelerate the sampling\nprocess by proposing fast ODE solvers. However, the inevitable discretization\nerrors of the ODE solvers are significantly magnified when the number of\nfunction evaluations (NFE) is small. In this work, we propose PFDiff, a novel\ntraining-free and orthogonal timestep-skipping strategy, which enables existing\nfast ODE solvers to operate with fewer NFE. 
Specifically, PFDiff initially\nutilizes gradient replacement from past time steps to predict a \"springboard\".\nSubsequently, it employs this \"springboard\" along with foresight updates\ninspired by Nesterov momentum to rapidly update current intermediate states.\nThis approach effectively reduces unnecessary NFE while correcting for\ndiscretization errors inherent in first-order ODE solvers. Experimental results\ndemonstrate that PFDiff exhibits flexible applicability across various\npre-trained DPMs, particularly excelling in conditional DPMs and surpassing\nprevious state-of-the-art training-free methods. For instance, using DDIM as a\nbaseline, we achieved 16.46 FID (4 NFE) compared to 138.81 FID with DDIM on\nImageNet 64x64 with classifier guidance, and 13.06 FID (10 NFE) on Stable\nDiffusion with a guidance scale of 7.5.\n","authors":["Guangyi Wang","Yuren Cai","Lijiang Li","Wei Peng","Songzhi Su"],"pdf_url":"https://arxiv.org/pdf/2408.08822v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11983v1","updated":"2024-09-18T13:40:59Z","published":"2024-09-18T13:40:59Z","title":"Intraoperative Registration by Cross-Modal Inverse Neural Rendering","summary":" We present in this paper a novel approach for 3D/2D intraoperative\nregistration during neurosurgery via cross-modal inverse neural rendering. Our\napproach separates implicit neural representation into two components, handling\nanatomical structure preoperatively and appearance intraoperatively. This\ndisentanglement is achieved by controlling a Neural Radiance Field's appearance\nwith a multi-style hypernetwork. Once trained, the implicit neural\nrepresentation serves as a differentiable rendering engine, which can be used\nto estimate the surgical camera pose by minimizing the dissimilarity between\nits rendered images and the target intraoperative image. We tested our method\non retrospective patient data from clinical cases, showing that it\noutperforms the state of the art while meeting current clinical standards for\nregistration. Code and additional resources can be found at\nhttps://maxfehrentz.github.io/style-ngp/.\n","authors":["Maximilian Fehrentz","Mohammad Farid Azampour","Reuben Dorent","Hassan Rasheed","Colin Galvin","Alexandra Golby","William M. Wells","Sarah Frisken","Nassir Navab","Nazim Haouchine"],"pdf_url":"https://arxiv.org/pdf/2409.11983v1.pdf","comment":"Accepted at MICCAI 2024"},{"id":"http://arxiv.org/abs/2409.11059v2","updated":"2024-09-18T13:27:39Z","published":"2024-09-17T10:38:46Z","title":"OneEncoder: A Lightweight Framework for Progressive Alignment of\n Modalities","summary":" Cross-modal alignment learning integrates information from different\nmodalities like text, image, audio and video to create unified models. This\napproach develops shared representations and learns correlations between\nmodalities, enabling applications such as visual question answering and\naudiovisual content analysis. Current techniques rely on large\nmodality-specific encoders, necessitating fine-tuning or training from scratch\non vast aligned datasets (e.g., text-image, text-audio, image-audio). This\napproach has limitations: (i) it is very expensive due to the need for training\nlarge encoders on extensive datasets, (ii) acquiring aligned large paired\ndatasets is challenging, and (iii) adding new modalities requires retraining\nthe entire framework to incorporate these modalities. 
To address these issues,\nwe propose OneEncoder, a lightweight framework that progressively represents\nand aligns four modalities (image, text, audio, video). Initially, we train a\nlightweight Universal Projection module (UP) to align image and text\nmodalities. Then, we freeze the pretrained UP and progressively align each new\nmodality to those already aligned. OneEncoder operates efficiently and\ncost-effectively, even in scenarios where vast aligned datasets are\nunavailable, due to its lightweight design. Trained on small paired datasets,\nit shows strong performance in tasks like classification, querying, and visual\nquestion answering, surpassing methods that rely on large datasets and\nspecialized encoders.\n","authors":["Bilal Faye","Hanane Azzag","Mustapha Lebbah"],"pdf_url":"https://arxiv.org/pdf/2409.11059v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11974v1","updated":"2024-09-18T13:26:45Z","published":"2024-09-18T13:26:45Z","title":"MitoSeg: Mitochondria Segmentation Tool","summary":" Recent studies suggest a potential link between the physical structure of\nmitochondria and neurodegenerative diseases. With advances in Electron\nMicroscopy techniques, it has become possible to visualize the boundary and\ninternal membrane structures of mitochondria in detail. It is crucial to\nautomatically segment mitochondria from these images to investigate the\nrelationship between mitochondria and diseases. In this paper, we present a\nsoftware solution for mitochondrial segmentation, highlighting mitochondria\nboundaries in electron microscopy tomography images and generating\ncorresponding 3D meshes.\n","authors":["Faris Serdar Taşel","Efe Çiftci"],"pdf_url":"https://arxiv.org/pdf/2409.11974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11969v1","updated":"2024-09-18T13:20:46Z","published":"2024-09-18T13:20:46Z","title":"Unveiling the Black Box: Independent Functional Module Evaluation for\n Bird's-Eye-View Perception Model","summary":" End-to-end models are emerging as the mainstream in autonomous driving\nperception. However, the inability to meticulously deconstruct their internal\nmechanisms results in diminished development efficacy and impedes the\nestablishment of trust. As a pioneering effort on this issue, we present the Independent\nFunctional Module Evaluation for Bird's-Eye-View Perception Model (BEV-IFME), a\nnovel framework that compares a module's feature maps against the ground truth\nwithin a unified semantic representation space to quantify their similarity,\nthereby assessing the training maturity of individual functional modules. The\ncore of the framework lies in the process of feature map encoding and\nrepresentation alignment, facilitated by our proposed two-stage Alignment\nAutoEncoder, which ensures the preservation of salient information and the\nconsistency of feature structure. 
The metric for evaluating the training\nmaturity of functional modules, the Similarity Score, demonstrates a robust\npositive correlation with BEV metrics, with an average correlation coefficient\nof 0.9387, attesting to the framework's reliability for assessment purposes.\n","authors":["Ludan Zhang","Xiaokang Ding","Yuqi Dai","Lei He","Keqiang Li"],"pdf_url":"https://arxiv.org/pdf/2409.11969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11960v1","updated":"2024-09-18T13:11:15Z","published":"2024-09-18T13:11:15Z","title":"A Chinese Continuous Sign Language Dataset Based on Complex Environments","summary":" The current bottleneck in continuous sign language recognition (CSLR)\nresearch lies in the fact that most publicly available datasets are limited to\nlaboratory environments or television program recordings, resulting in a single\nbackground environment with uniform lighting, which significantly deviates from\nthe diversity and complexity found in real-life scenarios. To address this\nchallenge, we have constructed a new, large-scale dataset for Chinese\ncontinuous sign language (CSL) based on complex environments, termed the\nComplex Environment Chinese Sign Language dataset (CE-CSL). This dataset\nencompasses 5,988 continuous CSL video clips collected from daily life scenes,\nfeaturing more than 70 different complex backgrounds to ensure\nrepresentativeness and generalization capability. To tackle the impact of\ncomplex backgrounds on CSLR performance, we propose a time-frequency network\n(TFNet) model for continuous sign language recognition. This model extracts\nframe-level features and then utilizes both temporal and spectral information\nto separately derive sequence features before fusion, aiming to achieve\nefficient and accurate CSLR. Experimental results demonstrate that our approach\nachieves significant performance improvements on the CE-CSL dataset, validating its\neffectiveness under complex background conditions. Additionally, our proposed\nmethod has also yielded highly competitive results when applied to three\npublicly available CSL datasets.\n","authors":["Qidan Zhu","Jing Li","Fei Yuan","Jiaojiao Fan","Quan Gan"],"pdf_url":"https://arxiv.org/pdf/2409.11960v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.11953v1","updated":"2024-09-18T13:07:19Z","published":"2024-09-18T13:07:19Z","title":"Tracking Any Point with Frame-Event Fusion Network at High Frame Rate","summary":" Tracking any point based on image frames is constrained by frame rates,\nleading to instability in high-speed scenarios and limited generalization in\nreal-world applications. To overcome these limitations, we propose an\nimage-event fusion point tracker, FE-TAP, which combines the contextual\ninformation from image frames with the high temporal resolution of events,\nachieving high frame rate and robust point tracking under various challenging\nconditions. Specifically, we designed an Evolution Fusion module (EvoFusion) to\nmodel the image generation process guided by events. This module can\neffectively integrate valuable information from both modalities operating at\ndifferent frequencies. To achieve smoother point trajectories, we employed a\ntransformer-based refinement strategy that updates the points' trajectories and\nfeatures iteratively. Extensive experiments demonstrate that our method\noutperforms state-of-the-art approaches, particularly improving expected\nfeature age by 24$\\%$ on the EDS dataset. 
Finally, we qualitatively validated the\nrobustness of our algorithm in real driving scenarios using our custom-designed\nhigh-resolution image-event synchronization device. Our source code will be\nreleased at https://github.com/ljx1002/FE-TAP.\n","authors":["Jiaxiong Liu","Bo Wang","Zhen Tan","Jinpu Zhang","Hui Shen","Dewen Hu"],"pdf_url":"https://arxiv.org/pdf/2409.11953v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11951v1","updated":"2024-09-18T13:05:43Z","published":"2024-09-18T13:05:43Z","title":"GaussianHeads: End-to-End Learning of Drivable Gaussian Head Avatars\n from Coarse-to-fine Representations","summary":" Real-time rendering of human head avatars is a cornerstone of many computer\ngraphics applications, such as augmented reality, video games, and films, to\nname a few. Recent approaches address this challenge with computationally\nefficient geometry primitives in a carefully calibrated multi-view setup.\nAlbeit producing photorealistic head renderings, these approaches often fail to represent\ncomplex motion changes such as the mouth interior and strongly varying head\nposes. We propose a new method to generate highly dynamic and deformable human\nhead avatars from multi-view imagery in real-time. At the core of our method is\na hierarchical representation of head models that allows us to capture the complex\ndynamics of facial expressions and head movements. First, with rich facial\nfeatures extracted from raw input frames, we learn to deform the coarse facial\ngeometry of the template mesh. We then initialize 3D Gaussians on the deformed\nsurface and refine their positions in a fine step. We train this coarse-to-fine\nfacial avatar model along with the head pose as a learnable parameter in an\nend-to-end framework. This enables not only controllable facial animation via\nvideo inputs, but also high-fidelity novel view synthesis of challenging facial\nexpressions, such as tongue deformations and fine-grained teeth structure under\nlarge motion changes. Moreover, it encourages the learned head avatar to\ngeneralize to new facial expressions and head poses at inference time. We\ndemonstrate the performance of our method with comparisons against the related\nmethods on different datasets, spanning challenging facial expression sequences\nacross multiple identities. We also show the potential application of our\napproach by demonstrating a cross-identity facial performance transfer\napplication.\n","authors":["Kartik Teotia","Hyeongwoo Kim","Pablo Garrido","Marc Habermann","Mohamed Elgharib","Christian Theobalt"],"pdf_url":"https://arxiv.org/pdf/2409.11951v1.pdf","comment":"ACM Transactions on Graphics (SIGGRAPH Asia 2024); Project page:\n https://vcai.mpi-inf.mpg.de/projects/GaussianHeads/"},{"id":"http://arxiv.org/abs/2409.11937v1","updated":"2024-09-18T12:52:54Z","published":"2024-09-18T12:52:54Z","title":"Differentiable Collision-Supervised Tooth Arrangement Network with a\n Decoupling Perspective","summary":" Tooth arrangement is an essential step in the digital orthodontic planning\nprocess. Existing learning-based methods use hidden teeth features to directly\nregress teeth motions, which couples target pose perception and motion\nregression. This coupling can lead to poor perception of three-dimensional\ntransformations. They also ignore the possible overlaps or gaps between teeth in the\npredicted dentition, which is generally unacceptable. Therefore, we propose\nDTAN, a differentiable collision-supervised tooth arrangement network,\ndecoupling prediction tasks and feature modeling. 
DTAN decouples the tooth\narrangement task by first predicting the hidden features of the final teeth\nposes and then using them to assist in regressing the motions between the\nbeginning and target teeth. To learn the hidden features better, DTAN also\ndecouples the teeth-hidden features into geometric and positional features,\nwhich are further supervised by feature consistency constraints. Furthermore,\nwe propose a novel differentiable collision loss function for point cloud data\nto constrain the relative poses between teeth, which can be easily extended\nto other 3D point cloud tasks. We propose an arch-width guided tooth\narrangement network, named C-DTAN, to make the results controllable. We\nconstruct three different tooth arrangement datasets and achieve drastically\nimproved accuracy and speed compared with existing methods.\n","authors":["Zhihui He","Chengyuan Wang","Shidong Yang","Li Chen","Yanheng Zhou","Shuo Wang"],"pdf_url":"https://arxiv.org/pdf/2409.11937v1.pdf","comment":"16 pages, 13 figures"},{"id":"http://arxiv.org/abs/2311.14922v2","updated":"2024-09-18T12:39:06Z","published":"2023-11-25T03:55:06Z","title":"GDTS: Goal-Guided Diffusion Model with Tree Sampling for Multi-Modal\n Pedestrian Trajectory Prediction","summary":" Accurate prediction of pedestrian trajectories is crucial for improving the\nsafety of autonomous driving. However, this task is generally nontrivial due to\nthe inherent stochasticity of human motion, which naturally requires the\npredictor to generate multi-modal predictions. Previous works leverage various\ngenerative methods, such as GAN and VAE, for pedestrian trajectory prediction.\nNevertheless, these methods may suffer from mode collapse and relatively\nlow-quality results. The denoising diffusion probabilistic model (DDPM) has\nrecently been applied to trajectory prediction due to its simple training\nprocess and powerful reconstruction ability. However, current diffusion-based\nmethods do not fully utilize input information and usually require many\ndenoising iterations that lead to a long inference time or an additional\nnetwork for initialization. To address these challenges and facilitate the use\nof diffusion models in multi-modal trajectory prediction, we propose GDTS, a\nnovel Goal-Guided Diffusion Model with Tree Sampling for multi-modal trajectory\nprediction. Considering the \"goal-driven\" characteristics of human motion, GDTS\nleverages goal estimation to guide the generation of the diffusion network. A\ntwo-stage tree sampling algorithm is presented, which leverages common features\nto reduce the inference time and improve accuracy for multi-modal prediction.\nExperimental results demonstrate that our proposed framework achieves\nperformance comparable to the state of the art with real-time inference speed on\npublic datasets.\n","authors":["Ge Sun","Sheng Wang","Lei Zhu","Ming Liu","Jun Ma"],"pdf_url":"https://arxiv.org/pdf/2311.14922v2.pdf","comment":"Submitted to ICRA 2025"},{"id":"http://arxiv.org/abs/2409.11923v1","updated":"2024-09-18T12:37:58Z","published":"2024-09-18T12:37:58Z","title":"Agglomerative Token Clustering","summary":" We present Agglomerative Token Clustering (ATC), a novel token merging method\nthat consistently outperforms previous token merging and pruning methods across\nimage classification, image synthesis, and object detection & segmentation\ntasks. ATC merges clusters through bottom-up hierarchical clustering, without\nthe introduction of extra learnable parameters. 
We find that ATC achieves\nstate-of-the-art performance across all tasks, and can even perform on par with\nthe prior state of the art when applied off-the-shelf, i.e. without fine-tuning.\nATC is particularly effective when applied with low keep rates, where only a\nsmall fraction of tokens are kept and retaining task performance is especially\ndifficult.\n","authors":["Joakim Bruslund Haurum","Sergio Escalera","Graham W. Taylor","Thomas B. Moeslund"],"pdf_url":"https://arxiv.org/pdf/2409.11923v1.pdf","comment":"ECCV 2024. Project webpage at https://vap.aau.dk/atc/"},{"id":"http://arxiv.org/abs/2409.11920v1","updated":"2024-09-18T12:32:39Z","published":"2024-09-18T12:32:39Z","title":"Generation of Complex 3D Human Motion by Temporal and Spatial\n Composition of Diffusion Models","summary":" In this paper, we address the challenge of generating realistic 3D human\nmotions for action classes that were never seen during the training phase. Our\napproach involves decomposing complex actions into simpler movements,\nspecifically those observed during training, by leveraging the knowledge of\nhuman motion contained in GPT models. These simpler movements are then\ncombined into a single, realistic animation using the properties of diffusion\nmodels. Our claim is that this decomposition and subsequent recombination of\nsimple movements can synthesize an animation that accurately represents the\ncomplex input action. This method operates during the inference phase and can\nbe integrated with any pre-trained diffusion model, enabling the synthesis of\nmotion classes not present in the training data. We evaluate our method by\ndividing two benchmark human motion datasets into basic and complex actions,\nand then comparing its performance against the state of the art.\n","authors":["Lorenzo Mandelli","Stefano Berretti"],"pdf_url":"https://arxiv.org/pdf/2409.11920v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.11919v1","updated":"2024-09-18T12:32:25Z","published":"2024-09-18T12:32:25Z","title":"LLM-wrapper: Black-Box Semantic-Aware Adaptation of Vision-Language\n Foundation Models","summary":" Vision Language Models (VLMs) have shown impressive performance on numerous\ntasks, but their zero-shot capabilities can be limited compared to dedicated or\nfine-tuned models. Yet, fine-tuning VLMs comes with limitations as it requires\n`white-box' access to the model's architecture and weights as well as expertise\nto design the fine-tuning objectives and optimize the hyper-parameters, which\nare specific to each VLM and downstream task. In this work, we propose\nLLM-wrapper, a novel approach to adapt VLMs in a `black-box' manner by\nleveraging large language models (LLMs) to reason about their outputs. We\ndemonstrate the effectiveness of LLM-wrapper on Referring Expression\nComprehension (REC), a challenging open-vocabulary task that requires spatial\nand semantic reasoning. 
Our approach significantly boosts the performance of\noff-the-shelf models, yielding results competitive with\nclassic fine-tuning.\n","authors":["Amaia Cardiel","Eloi Zablocki","Oriane Siméoni","Elias Ramzi","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2409.11919v1.pdf","comment":"EVAL-FoMo workshop, ECCV 2024"},{"id":"http://arxiv.org/abs/2409.08513v3","updated":"2024-09-18T12:30:30Z","published":"2024-09-13T03:23:52Z","title":"Mamba-YOLO-World: Marrying YOLO-World with Mamba for Open-Vocabulary\n Detection","summary":" Open-vocabulary detection (OVD) aims to detect objects beyond a predefined\nset of categories. As a pioneering model incorporating the YOLO series into\nOVD, YOLO-World is well-suited for scenarios prioritizing speed and efficiency.\nHowever, its performance is hindered by its neck feature fusion mechanism,\nwhich incurs quadratic complexity and limits the guided receptive fields.\nTo address these limitations, we present Mamba-YOLO-World, a novel YOLO-based\nOVD model employing the proposed MambaFusion Path Aggregation Network\n(MambaFusion-PAN) as its neck architecture. Specifically, we introduce an\ninnovative State Space Model-based feature fusion mechanism consisting of a\nParallel-Guided Selective Scan algorithm and a Serial-Guided Selective Scan\nalgorithm with linear complexity and globally guided receptive fields. It\nleverages multi-modal input sequences and Mamba hidden states to guide the\nselective scanning process. Experiments demonstrate that our model outperforms\nthe original YOLO-World on the COCO and LVIS benchmarks in both zero-shot and\nfine-tuning settings while maintaining comparable parameters and FLOPs.\nAdditionally, it surpasses existing state-of-the-art OVD methods with fewer\nparameters and FLOPs.\n","authors":["Haoxuan Wang","Qingdong He","Jinlong Peng","Hao Yang","Mingmin Chi","Yabiao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.08513v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.03456v2","updated":"2024-09-18T12:12:41Z","published":"2024-09-05T12:09:02Z","title":"LM-Gaussian: Boost Sparse-view 3D Gaussian Splatting with Large Model\n Priors","summary":" We aim to address sparse-view reconstruction of a 3D scene by leveraging\npriors from large-scale vision models. While recent advancements such as 3D\nGaussian Splatting (3DGS) have demonstrated remarkable successes in 3D\nreconstruction, these methods typically necessitate hundreds of input images\nthat densely capture the underlying scene, making them time-consuming and\nimpractical for real-world applications. However, sparse-view reconstruction is\ninherently ill-posed and under-constrained, often resulting in inferior and\nincomplete outcomes. This is due to issues such as failed initialization,\noverfitting on input images, and a lack of details. To mitigate these\nchallenges, we introduce LM-Gaussian, a method capable of generating\nhigh-quality reconstructions from a limited number of images. Specifically, we\npropose a robust initialization module that leverages stereo priors to aid in\nthe recovery of camera poses and reliable point clouds. Additionally, a\ndiffusion-based refinement is iteratively applied to incorporate image\ndiffusion priors into the Gaussian optimization process to preserve intricate\nscene details. Finally, we utilize video diffusion priors to further enhance\nthe rendered images for realistic visual effects. 
Overall, our approach\nsignificantly reduces the data acquisition requirements compared to previous\n3DGS methods. We validate the effectiveness of our framework through\nexperiments on various public datasets, demonstrating its potential for\nhigh-quality 360-degree scene reconstruction. Visual results are on our\nwebsite.\n","authors":["Hanyang Yu","Xiaoxiao Long","Ping Tan"],"pdf_url":"https://arxiv.org/pdf/2409.03456v2.pdf","comment":"Project page: https://hanyangyu1021.github.io/lm-gaussian.github.io/"},{"id":"http://arxiv.org/abs/2409.11910v1","updated":"2024-09-18T12:11:59Z","published":"2024-09-18T12:11:59Z","title":"Tumor aware recurrent inter-patient deformable image registration of\n computed tomography scans with lung cancer","summary":" Background: Voxel-based analysis (VBA) for population-level radiotherapy (RT)\noutcomes modeling requires topology-preserving inter-patient deformable image\nregistration (DIR) that preserves tumors on moving images while avoiding\nunrealistic deformations due to tumors occurring on fixed images. Purpose: We\ndeveloped a tumor-aware recurrent registration (TRACER) deep learning (DL)\nmethod and evaluated its suitability for VBA. Methods: TRACER consists of\nencoder layers implemented with stacked 3D convolutional long short-term memory\nnetworks (3D-CLSTM) followed by decoder and spatial transform layers to compute\na dense deformation vector field (DVF). Multiple CLSTM steps are used to compute\na progressive sequence of deformations. Input conditioning was applied by\nincluding tumor segmentations with 3D image pairs as input channels.\nBidirectional tumor rigidity, image similarity, and deformation smoothness\nlosses were used to optimize the network in an unsupervised manner. TRACER and\nmultiple DL methods were trained with 204 3D CT image pairs from patients with\nlung cancers (LC) and evaluated using (a) Dataset I (N = 308 pairs) with DL\nsegmented LCs, (b) Dataset II (N = 765 pairs) with manually delineated LCs, and\n(c) Dataset III with 42 LC patients treated with RT. Results: TRACER accurately\naligned normal tissues. It best preserved tumors, indicated by the\nsmallest tumor volume differences of 0.24\\%, 0.40\\%, and 0.13\\% and mean square\nerrors in CT intensities of 0.005, 0.005, and 0.004, computed between original and\nresampled moving image tumors, for Datasets I, II, and III, respectively. It\nresulted in the smallest planned RT tumor dose difference computed between\noriginal and resampled moving images of 0.01 Gy and 0.013 Gy when using a\nfemale and a male reference.\n","authors":["Jue Jiang","Chloe Min Seo Choi","Maria Thor","Joseph O. Deasy","Harini Veeraraghavan"],"pdf_url":"https://arxiv.org/pdf/2409.11910v1.pdf","comment":"Minor revision under the journal of Medical Physics"},{"id":"http://arxiv.org/abs/2409.11904v1","updated":"2024-09-18T12:02:20Z","published":"2024-09-18T12:02:20Z","title":"Finding the Subjective Truth: Collecting 2 Million Votes for\n Comprehensive Gen-AI Model Evaluation","summary":" Efficiently evaluating the performance of text-to-image models is difficult\nas it inherently requires subjective judgment and human preference, making it\nhard to compare different models and quantify the state of the art. Leveraging\nRapidata's technology, we present an efficient annotation framework that\nsources human feedback from a diverse, global pool of annotators. 
Our study\ncollected over 2 million annotations across 4,512 images, evaluating four\nprominent models (DALL-E 3, Flux.1, MidJourney, and Stable Diffusion) on style\npreference, coherence, and text-to-image alignment. We demonstrate that our\napproach makes it feasible to comprehensively rank image generation models\nbased on a vast pool of annotators and show that the diverse annotator\ndemographics reflect the world population, significantly decreasing the risk of\nbiases.\n","authors":["Dimitrios Christodoulou","Mads Kuhlmann-Jørgensen"],"pdf_url":"https://arxiv.org/pdf/2409.11904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11622v3","updated":"2024-09-18T12:00:56Z","published":"2023-10-17T23:20:36Z","title":"High-Resolution Building and Road Detection from Sentinel-2","summary":" Mapping buildings and roads automatically with remote sensing typically\nrequires high-resolution imagery, which is expensive to obtain and often\nsparsely available. In this work we demonstrate how multiple 10 m resolution\nSentinel-2 images can be used to generate 50 cm resolution building and road\nsegmentation masks. This is done by training a `student' model with access to\nSentinel-2 images to reproduce the predictions of a `teacher' model which has\naccess to corresponding high-resolution imagery. While the predictions do not\nhave all the fine detail of the teacher model, we find that we are able to\nretain much of the performance: for building segmentation we achieve 79.0\\%\nmIoU, compared to the high-resolution teacher model accuracy of 85.5\\% mIoU. We\nalso describe two related methods that work on Sentinel-2 imagery: one for\ncounting individual buildings which achieves $R^2 = 0.91$ against true counts\nand one for predicting building height with 1.5 meter mean absolute error. This\nwork opens up new possibilities for using freely available Sentinel-2 imagery\nfor a range of tasks that previously could only be done with high-resolution\nsatellite imagery.\n","authors":["Wojciech Sirko","Emmanuel Asiedu Brempong","Juliana T. C. Marcos","Abigail Annkah","Abel Korme","Mohammed Alewi Hassen","Krishna Sapkota","Tomer Shekel","Abdoulaye Diack","Sella Nevo","Jason Hickey","John Quinn"],"pdf_url":"https://arxiv.org/pdf/2310.11622v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13579v3","updated":"2024-09-18T11:12:39Z","published":"2024-04-21T08:37:43Z","title":"LTOS: Layout-controllable Text-Object Synthesis via Adaptive\n Cross-attention Fusions","summary":" Controllable text-to-image generation synthesizes visual text and objects in\nimages with certain conditions, which are frequently applied to emoji and\nposter generation. Visual text rendering and layout-to-image generation tasks\nhave been popular in controllable text-to-image generation. However, each of\nthese tasks typically focuses on single modality generation or rendering,\nleaving yet-to-be-bridged gaps between the approaches correspondingly designed\nfor each of the tasks. In this paper, we combine text rendering and\nlayout-to-image generation tasks into a single task: layout-controllable\ntext-object synthesis (LTOS) task, aiming at synthesizing images with object\nand visual text based on predefined object layout and text contents. As\ncompliant datasets are not readily available for our LTOS task, we construct a\nlayout-aware text-object synthesis dataset, containing elaborate well-aligned\nlabels of visual text and object information. 
Based on the dataset, we propose\na layout-controllable text-object adaptive fusion (TOF) framework, which\ngenerates images with clear, legible visual text and plausible objects. We\nconstruct a visual-text rendering module to synthesize text and employ an\nobject-layout control module to generate objects, integrating the two\nmodules to harmoniously generate text content and objects in\nimages. To improve image-text integration, we propose a self-adaptive\ncross-attention fusion module that helps the image generation attend more to\nimportant text information. Within this fusion module, we use a self-adaptive\nlearnable factor that learns to flexibly control the influence of cross-attention\noutputs on image generation. Experimental results show that our method\noutperforms the state-of-the-art in LTOS, text rendering, and layout-to-image\ntasks, enabling harmonious visual text rendering and object generation.\n","authors":["Xiaoran Zhao","Tianhao Wu","Yu Lai","Zhiliang Tian","Zhen Huang","Yahui Liu","Zejiang He","Dongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2404.13579v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11874v1","updated":"2024-09-18T11:04:35Z","published":"2024-09-18T11:04:35Z","title":"ABHINAW: A method for Automatic Evaluation of Typography within\n AI-Generated Images","summary":" In the fast-evolving field of Generative AI, platforms like MidJourney,\nDALL-E, and Stable Diffusion have transformed Text-to-Image (T2I) Generation.\nHowever, despite their impressive ability to create high-quality images, they\noften struggle to generate accurate text within these images. Theoretically, if\nwe could achieve accurate text generation in AI images in a ``zero-shot''\nmanner, it would not only make AI-generated images more meaningful but also\ndemocratize the graphic design industry. The first step towards this goal is to\ncreate a robust scoring matrix for evaluating text accuracy in AI-generated\nimages. Although there are existing benchmarking methods like CLIP SCORE and\nT2I-CompBench++, there is still a gap in systematically evaluating text and\ntypography in AI-generated images, especially with diffusion-based methods. In\nthis paper, we introduce a novel evaluation matrix designed explicitly for\nquantifying the performance of text and typography generation within\nAI-generated images. We use a letter-by-letter matching strategy to compute\nexact matching scores between the reference text and the AI-generated text. Our\napproach to calculating the score handles multiple redundancies such\nas repetition of words, case sensitivity, mixing of words, and irregular\nincorporation of letters. Moreover, we have developed a novel method, named\nbrevity adjustment, to handle excess text. In addition, we provide a\nquantitative analysis of frequent errors arising from frequently used words and\nless frequently used words. Project page is available at:\nhttps://github.com/Abhinaw3906/ABHINAW-MATRIX.\n","authors":["Abhinaw Jagtap","Nachiket Tapas","R. G. Brajesh"],"pdf_url":"https://arxiv.org/pdf/2409.11874v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11869v1","updated":"2024-09-18T10:52:02Z","published":"2024-09-18T10:52:02Z","title":"SpheriGait: Enriching Spatial Representation via Spherical Projection\n for LiDAR-based Gait Recognition","summary":" Gait recognition is a rapidly progressing technique for the remote\nidentification of individuals. 
Prior research, predominantly employing 2D\nsensors to gather gait data, has achieved notable advancements; nonetheless,\nit has unavoidably neglected the influence of 3D dynamic characteristics on\nrecognition. Gait recognition utilizing LiDAR 3D point clouds not only directly\ncaptures 3D spatial features but also diminishes the impact of lighting\nconditions while ensuring privacy protection. The essence of the problem lies in\nhow to effectively extract discriminative 3D dynamic representations from point\nclouds. In this paper, we propose a method named SpheriGait for extracting and\nenhancing dynamic features from point clouds for LiDAR-based gait recognition.\nSpecifically, it substitutes the conventional point cloud plane projection\nmethod with spherical projection to augment the perception of dynamic\nfeatures. Additionally, a network block named DAM-L is proposed to extract gait\ncues from the projected point cloud data. We conducted extensive experiments,\nand the results demonstrate that SpheriGait achieves state-of-the-art\nperformance on the SUSTech1K dataset and verify that the spherical\nprojection method can serve as a universal data preprocessing technique to\nenhance the performance of other LiDAR-based gait recognition methods,\nexhibiting exceptional flexibility and practicality.\n","authors":["Yanxi Wang","Zhigang Chang","Chen Wu","Zihao Cheng","Hongmin Gao"],"pdf_url":"https://arxiv.org/pdf/2409.11869v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.19140v2","updated":"2024-09-18T10:50:32Z","published":"2024-03-28T04:24:56Z","title":"QNCD: Quantization Noise Correction for Diffusion Models","summary":" Diffusion models have revolutionized image synthesis, setting new benchmarks\nin quality and creativity. However, their widespread adoption is hindered by\nthe intensive computation required during the iterative denoising process.\nPost-training quantization (PTQ) presents a solution to accelerate sampling,\nalbeit at the expense of sample quality, especially in low-bit settings.\nAddressing this, our study introduces a unified Quantization Noise Correction\nScheme (QNCD), aimed at minimizing quantization noise throughout the sampling\nprocess. We identify two primary quantization challenges: intra and inter\nquantization noise. Intra quantization noise, mainly exacerbated by embeddings\nin the resblock module, extends activation quantization ranges, increasing\ndisturbances in each single denoising step. In addition, inter quantization noise\nstems from cumulative quantization deviations across the entire denoising\nprocess, altering data distributions step-by-step. QNCD combats these through\nembedding-derived feature smoothing for eliminating intra quantization noise\nand an effective runtime noise estimation module for dynamically filtering\ninter quantization noise. 
Extensive experiments demonstrate that our method\noutperforms previous quantization methods for diffusion models, achieving\nlossless results in W4A8 and W8A8 quantization settings on ImageNet (LDM-4).\nCode is available at: https://github.com/huanpengchu/QNCD\n","authors":["Huanpeng Chu","Wei Wu","Chengjie Zang","Kun Yuan"],"pdf_url":"https://arxiv.org/pdf/2403.19140v2.pdf","comment":"Accepted by ACMMM2024"},{"id":"http://arxiv.org/abs/2409.11867v1","updated":"2024-09-18T10:48:10Z","published":"2024-09-18T10:48:10Z","title":"Distillation-free Scaling of Large SSMs for Images and Videos","summary":" State-space models (SSMs), exemplified by S4, have introduced a novel context\nmodeling method by integrating state-space techniques into deep learning.\nHowever, they struggle with global context modeling due to their\ndata-independent matrices. The Mamba model addressed this with data-dependent\nvariants via the S6 selective-scan algorithm, enhancing context modeling,\nespecially for long sequences. However, Mamba-based architectures are difficult\nto scale with respect to the number of parameters, which is a major limitation\nfor vision applications. This paper addresses the scalability issue of large\nSSMs for image classification and action recognition without requiring\nadditional techniques like knowledge distillation. We analyze the distinct\ncharacteristics of Mamba-based and Attention-based models, proposing a\nMamba-Attention interleaved architecture that enhances scalability, robustness,\nand performance. We demonstrate that the stable and efficient interleaved\narchitecture resolves the scalability issue of Mamba-based architectures for\nimages and videos and increases robustness to common artifacts like JPEG\ncompression. Our thorough evaluation on the ImageNet-1K, Kinetics-400 and\nSomething-Something-v2 benchmarks demonstrates that our approach improves the\naccuracy of state-of-the-art Mamba-based architectures by up to $+1.7$.\n","authors":["Hamid Suleman","Syed Talal Wasim","Muzammal Naseer","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2409.11867v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11854v1","updated":"2024-09-18T10:22:07Z","published":"2024-09-18T10:22:07Z","title":"Physically-Based Photometric Bundle Adjustment in Non-Lambertian\n Environments","summary":" Photometric bundle adjustment (PBA) is widely used in estimating the camera\npose and 3D geometry by assuming a Lambertian world. However, the assumption of\nphotometric consistency is often violated since the non-diffuse reflection is\ncommon in real-world environments. The photometric inconsistency significantly\naffects the reliability of existing PBA methods. To solve this problem, we\npropose a novel physically-based PBA method. Specifically, we introduce the\nphysically-based weights regarding material, illumination, and light path.\nThese weights distinguish the pixel pairs with different levels of photometric\ninconsistency. We also design corresponding models for material estimation\nbased on sequential images and illumination estimation based on point clouds.\nIn addition, we establish the first SLAM-related dataset of non-Lambertian\nscenes with complete ground truth of illumination and material. 
Extensive\nexperiments demonstrated that our PBA method outperforms existing approaches in\naccuracy.\n","authors":["Lei Cheng","Junpeng Hu","Haodong Yan","Mariia Gladkova","Tianyu Huang","Yun-Hui Liu","Daniel Cremers","Haoang Li"],"pdf_url":"https://arxiv.org/pdf/2409.11854v1.pdf","comment":"Accepted to 2024 IEEE/RSJ International Conference on Intelligent\n Robots and Systems (IROS 2024)"},{"id":"http://arxiv.org/abs/2409.11206v2","updated":"2024-09-18T09:57:28Z","published":"2024-09-17T14:00:58Z","title":"High-Order Evolving Graphs for Enhanced Representation of Traffic\n Dynamics","summary":" We present an innovative framework for traffic dynamics analysis using\nHigh-Order Evolving Graphs, designed to improve spatio-temporal representations\nin autonomous driving contexts. Our approach constructs temporal bidirectional\nbipartite graphs that effectively model the complex interactions within traffic\nscenes in real-time. By integrating Graph Neural Networks (GNNs) with\nhigh-order multi-aggregation strategies, we significantly enhance the modeling\nof traffic scene dynamics, providing a more accurate and detailed analysis of\nthese interactions. Additionally, we incorporate inductive learning techniques\ninspired by the GraphSAGE framework, enabling our model to adapt to new and\nunseen traffic scenarios without the need for retraining, thus ensuring robust\ngeneralization. Through extensive experiments on the ROAD and ROAD Waymo\ndatasets, we establish a comprehensive baseline for further developments,\ndemonstrating the potential of our method in accurately capturing traffic\nbehavior. Our results emphasize the value of high-order statistical moments and\nfeature-gated attention mechanisms in improving traffic behavior analysis,\nlaying the groundwork for advancing autonomous driving technologies. Our source\ncode is available at: https://github.com/Addy-1998/High_Order_Graphs\n","authors":["Aditya Humnabadkar","Arindam Sikdar","Benjamin Cave","Huaizhong Zhang","Paul Bakaki","Ardhendu Behera"],"pdf_url":"https://arxiv.org/pdf/2409.11206v2.pdf","comment":"Accepted manuscript - 2nd Workshop on Vision-Centric Autonomous\n Driving (VCAD) as part of European Conference on Computer Vision (ECCV) 2024"},{"id":"http://arxiv.org/abs/2409.11836v1","updated":"2024-09-18T09:38:08Z","published":"2024-09-18T09:38:08Z","title":"NT-ViT: Neural Transcoding Vision Transformers for EEG-to-fMRI Synthesis","summary":" This paper introduces the Neural Transcoding Vision Transformer (\\modelname),\na generative model designed to estimate high-resolution functional Magnetic\nResonance Imaging (fMRI) samples from simultaneous Electroencephalography (EEG)\ndata. A key feature of \\modelname is its Domain Matching (DM) sub-module which\neffectively aligns the latent EEG representations with those of fMRI volumes,\nenhancing the model's accuracy and reliability. Unlike previous methods that\ntend to struggle with fidelity and reproducibility of images, \\modelname\naddresses these challenges by ensuring methodological integrity and\nhigher-quality reconstructions which we showcase through extensive evaluation\non two benchmark datasets; \\modelname outperforms the current state-of-the-art\nby a significant margin in both cases, e.g. achieving a $10\\times$ reduction in\nRMSE and a $3.14\\times$ increase in SSIM on the Oddball dataset. An ablation\nstudy also provides insights into the contribution of each component to the\nmodel's overall effectiveness. 
This development is critical in offering a new\napproach to lessen the time and financial constraints typically linked with\nhigh-resolution brain imaging, thereby aiding in the swift and precise\ndiagnosis of neurological disorders. While it is not a replacement for\nactual fMRI but rather a step towards making such imaging more accessible, we\nbelieve it represents a pivotal advancement in clinical practice and\nneuroscience research. Code is available at\n\\url{https://github.com/rom42pla/ntvit}.\n","authors":["Romeo Lanzino","Federico Fontana","Luigi Cinque","Francesco Scarcello","Atsuto Maki"],"pdf_url":"https://arxiv.org/pdf/2409.11836v1.pdf","comment":"ECCV24 Workshop on Synthetic Data for Computer Vision"},{"id":"http://arxiv.org/abs/2409.11172v2","updated":"2024-09-18T09:35:15Z","published":"2024-09-17T13:26:17Z","title":"Annealed Winner-Takes-All for Motion Forecasting","summary":" In autonomous driving, motion prediction aims at forecasting the future\ntrajectories of nearby agents, helping the ego vehicle to anticipate behaviors\nand drive safely. A key challenge is generating a diverse set of future\npredictions, commonly addressed using data-driven models with Multiple Choice\nLearning (MCL) architectures and Winner-Takes-All (WTA) training objectives.\nHowever, these methods face initialization sensitivity and training\ninstabilities. Additionally, to compensate for limited performance, some\napproaches rely on training with a large set of hypotheses, requiring a\npost-selection step during inference to significantly reduce the number of\npredictions. To tackle these issues, we take inspiration from annealed MCL, a\nrecently introduced technique that improves the convergence properties of MCL\nmethods through an annealed Winner-Takes-All loss (aWTA). In this paper, we\ndemonstrate how the aWTA loss can be integrated with state-of-the-art motion\nforecasting models to enhance their performance using only a minimal set of\nhypotheses, eliminating the need for the cumbersome post-selection step. Our\napproach can be easily incorporated into any trajectory prediction model\nnormally trained using WTA and yields significant improvements. To facilitate\nthe application of our approach to future motion forecasting models, the code\nwill be made publicly available upon acceptance:\nhttps://github.com/valeoai/MF_aWTA.\n","authors":["Yihong Xu","Victor Letzelter","Mickaël Chen","Éloi Zablocki","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2409.11172v2.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.11831v1","updated":"2024-09-18T09:30:03Z","published":"2024-09-18T09:30:03Z","title":"RaggeDi: Diffusion-based State Estimation of Disordered Rags, Sheets,\n Towels and Blankets","summary":" Cloth state estimation is an important problem in robotics. It is essential\nfor the robot to know the accurate state to manipulate cloth and execute tasks\nsuch as robotic dressing, stitching, and covering/uncovering human beings.\nHowever, estimating cloth state accurately remains challenging due to its high\nflexibility and self-occlusion. This paper proposes a diffusion model-based\npipeline that formulates the cloth state estimation as an image generation\nproblem by representing the cloth state as an RGB image that describes the\npoint-wise translation (translation map) between a pre-defined flattened mesh\nand the deformed mesh in a canonical space. 
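For intuition about the translation-map representation just described, a minimal NumPy sketch; the grid ordering of vertices and the min-max normalization are assumptions for illustration, not the paper's canonical-space definition.

```python
import numpy as np

def translation_map(flat_vertices, deformed_vertices, grid=(64, 64)):
    """Encode per-vertex 3D displacement between a flattened mesh and its
    deformed counterpart as an RGB image (a sketch of the representation
    described above; assumes vertices are ordered on a regular grid)."""
    disp = deformed_vertices - flat_vertices          # (N, 3) point-wise translation
    lo, hi = disp.min(), disp.max()
    rgb = (disp - lo) / (hi - lo + 1e-8)              # min-max normalize to [0, 1]
    return rgb.reshape(*grid, 3)                      # H x W x 3 "image"

# toy usage: a flat 64 x 64 lattice and a random small deformation
flat = np.stack(np.meshgrid(np.linspace(0, 1, 64), np.linspace(0, 1, 64)), -1)
flat = np.concatenate([flat.reshape(-1, 2), np.zeros((64 * 64, 1))], axis=1)
deformed = flat + 0.05 * np.random.randn(64 * 64, 3)
img = translation_map(flat, deformed)                 # (64, 64, 3)
```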
Then we train a conditional\ndiffusion-based image generation model to predict the translation map based on\nan observation. Experiments are conducted in both simulation and the real world\nto validate the performance of our method. Results indicate that our method\noutperforms two recent methods in both accuracy and speed.\n","authors":["Jikai Ye","Wanze Li","Shiraz Khan","Gregory S. Chirikjian"],"pdf_url":"https://arxiv.org/pdf/2409.11831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11819v1","updated":"2024-09-18T09:11:31Z","published":"2024-09-18T09:11:31Z","title":"End-to-End Probabilistic Geometry-Guided Regression for 6DoF Object Pose\n Estimation","summary":" 6D object pose estimation is the problem of identifying the position and\norientation of an object relative to a chosen coordinate system, which is a\ncore technology for modern XR applications. State-of-the-art 6D object pose\nestimators directly predict an object pose given an object observation. Due to\nthe ill-posed nature of the pose estimation problem, where multiple different\nposes can correspond to a single observation, generating additional plausible\nestimates per observation can be valuable. To address this, we reformulate the\nstate-of-the-art algorithm GDRNPP and introduce EPRO-GDR (End-to-End\nProbabilistic Geometry-Guided Regression). Instead of predicting a single pose\nper detection, we estimate a probability density distribution of the pose.\nUsing the evaluation procedure defined by the BOP (Benchmark for 6D Object Pose\nEstimation) Challenge, we test our approach on four of its core datasets and\ndemonstrate superior quantitative results for EPRO-GDR on LM-O, YCB-V, and\nITODD. Our probabilistic solution shows that predicting a pose distribution\ninstead of a single pose can improve state-of-the-art single-view pose\nestimation while providing the additional benefit of being able to sample\nmultiple meaningful pose candidates.\n","authors":["Thomas Pöllabauer","Jiayin Li","Volker Knauthe","Sarah Berkei","Arjan Kuijper"],"pdf_url":"https://arxiv.org/pdf/2409.11819v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11817v1","updated":"2024-09-18T09:08:16Z","published":"2024-09-18T09:08:16Z","title":"EFCM: Efficient Fine-tuning on Compressed Models for deployment of large\n models in medical image analysis","summary":" The recent development of deep learning large models in medicine shows\nremarkable performance in medical image analysis and diagnosis, but their large\nnumber of parameters causes memory and inference latency challenges. Knowledge\ndistillation offers a solution, but the slide-level gradients cannot be\nbackpropagated for student model updates due to high-resolution pathological\nimages and slide-level labels. This study presents an Efficient Fine-tuning on\nCompressed Models (EFCM) framework with two stages: unsupervised feature\ndistillation and fine-tuning. In the distillation stage, Feature Projection\nDistillation (FPD) is proposed with a TransScan module for adaptive receptive\nfield adjustment to enhance the knowledge absorption capability of the student\nmodel. In the slide-level fine-tuning stage, three strategies (Reuse CLAM,\nRetrain CLAM, and End2end Train CLAM (ETC)) are compared. Experiments are\nconducted on 11 downstream datasets related to three large medical models:\nRETFound for retina, MRM for chest X-ray, and BROW for histopathology. 
The\nexperimental results demonstrate that the EFCM framework significantly improves\naccuracy and efficiency in handling slide-level pathological image problems,\neffectively addressing the challenges of deploying large medical models.\nSpecifically, it achieves a 4.33% increase in ACC and a 5.2% increase in AUC\ncompared to the large model BROW on the TCGA-NSCLC and TCGA-BRCA datasets. The\nanalysis of model inference efficiency highlights the high efficiency of the\ndistillation fine-tuning method.\n","authors":["Shaojie Li","Zhaoshuo Diao"],"pdf_url":"https://arxiv.org/pdf/2409.11817v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11816v1","updated":"2024-09-18T09:06:55Z","published":"2024-09-18T09:06:55Z","title":"SymFace: Additional Facial Symmetry Loss for Deep Face Recognition","summary":" Over the past decade, there has been a steady advancement in enhancing face\nrecognition algorithms leveraging advanced machine learning methods. The loss\nfunction plays a pivotal, game-changing role in addressing face verification\nproblems. These loss functions have mainly explored variations among\nintra-class or inter-class separation. This research examines\nthe natural phenomenon of facial symmetry in the face verification problem. The\nsymmetry between the left and right hemi-faces has been widely used in many\nresearch areas in recent decades. This paper adopts this simple approach\njudiciously by splitting the face image vertically into two halves. With the\nassumption that the natural phenomenon of facial symmetry can enhance face\nverification methodology, we hypothesize that the two output embedding vectors\nof split faces must project close to each other in the output embedding space.\nInspired by this concept, we penalize the network based on the disparity of\nembeddings of the symmetrical pair of split faces. The symmetry loss has the\npotential to minimize minor asymmetric features due to facial expression and\nlighting conditions, hence significantly increasing the inter-class variance\namong the classes and leading to more reliable face embeddings. This loss\nfunction propels any network to outperform its baseline performance across all\nexisting network architectures and configurations, enabling us to achieve SoTA\nresults.\n","authors":["Pritesh Prakash","Koteswar Rao Jerripothula","Ashish Jacob Sam","Prinsh Kumar Singh","S Umamaheswaran"],"pdf_url":"https://arxiv.org/pdf/2409.11816v1.pdf","comment":"11 Pages, 6 Figures, 5 Tables, Submitted for WACV 2025"},{"id":"http://arxiv.org/abs/2409.11813v1","updated":"2024-09-18T09:01:34Z","published":"2024-09-18T09:01:34Z","title":"EventAug: Multifaceted Spatio-Temporal Data Augmentation Methods for\n Event-based Learning","summary":" The event camera has demonstrated significant success across a wide range of\nareas due to its low time latency and high dynamic range. However, the\ncommunity faces challenges such as data deficiency and limited diversity, often\nresulting in over-fitting and inadequate feature learning. Notably, the\nexploration of data augmentation techniques in the event community remains\nscarce. This work aims to address this gap by introducing a systematic\naugmentation scheme named EventAug to enrich spatial-temporal diversity. 
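The symmetry penalty described in the SymFace abstract above might look roughly like the following sketch, assuming aligned frontal crops and an arbitrary embedding network; the cosine-disparity form and the mirroring of the right half are illustrative choices, not the paper's exact loss.

```python
import torch
import torch.nn.functional as F

def symmetry_loss(images, embed):
    """Penalize embedding disparity between the two vertical halves of a
    face (sketch of the SymFace idea). images: (B, C, H, W) aligned crops;
    embed: any network mapping images to embeddings (an assumption here)."""
    w = images.shape[-1]
    left = images[..., : w // 2]
    right = torch.flip(images[..., w - w // 2 :], dims=[-1])  # mirror right half
    e_left = F.normalize(embed(left), dim=1)
    e_right = F.normalize(embed(right), dim=1)
    return (1.0 - (e_left * e_right).sum(dim=1)).mean()       # cosine disparity

# toy usage with a throwaway encoder that accepts half-width inputs
net = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.LazyLinear(128))
faces = torch.randn(4, 3, 112, 112)
print(symmetry_loss(faces, net))
```

In practice such a term would presumably be added, with a small weight, to a standard margin-based identity loss.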
In\nparticular, we first propose Multi-scale Temporal Integration (MSTI) to\ndiversify the motion speed of objects, then introduce Spatial-salient Event\nMask (SSEM) and Temporal-salient Event Mask (TSEM) to enrich object variants.\nOur EventAug can facilitate model learning with richer motion patterns, object\nvariants and local spatio-temporal relations, thus improving model robustness\nto varied moving speeds, occlusions, and action disruptions. Experimental results\nshow that our augmentation method consistently yields significant improvements\nacross different tasks and backbones (e.g., a 4.87% accuracy gain on DVS128\nGesture). Our code will be publicly available for this community.\n","authors":["Yukun Tian","Hao Chen","Yongjian Deng","Feihong Shen","Kepan Liu","Wei You","Ziyang Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.11813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.00428v4","updated":"2024-09-18T09:00:30Z","published":"2023-08-01T10:14:43Z","title":"Multiscale Feature Learning Using Co-Tuplet Loss for Offline Handwritten\n Signature Verification","summary":" Handwritten signature verification, crucial for legal and financial\ninstitutions, faces challenges including inter-writer similarity, intra-writer\nvariations, and limited signature samples. To address these, we introduce the\nMultiScale Signature feature learning Network (MS-SigNet) with the co-tuplet\nloss, a novel metric learning loss designed for offline handwritten signature\nverification. MS-SigNet learns both global and regional signature features from\nmultiple spatial scales, enhancing feature discrimination. This approach\neffectively distinguishes genuine signatures from skilled forgeries by\ncapturing overall strokes and detailed local differences. The co-tuplet loss,\nfocusing on multiple positive and negative examples, overcomes the limitations\nof typical metric learning losses by addressing inter-writer similarity and\nintra-writer variations and emphasizing informative examples. We also present\nHanSig, a large-scale Chinese signature dataset to support robust system\ndevelopment for this language. The dataset is accessible at\n\\url{https://github.com/hsinmin/HanSig}. Experimental results on four benchmark\ndatasets in different languages demonstrate the promising performance of our\nmethod in comparison to state-of-the-art approaches.\n","authors":["Fu-Hsien Huang","Hsin-Min Lu"],"pdf_url":"https://arxiv.org/pdf/2308.00428v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11802v1","updated":"2024-09-18T08:35:31Z","published":"2024-09-18T08:35:31Z","title":"Latent fingerprint enhancement for accurate minutiae detection","summary":" Identification of suspects based on partial and smudged fingerprints,\ncommonly referred to as fingermarks or latent fingerprints, presents a\nsignificant challenge in the field of fingerprint recognition. Although\nfixed-length embeddings have shown effectiveness in recognising rolled and slap\nfingerprints, the methods for matching latent fingerprints have primarily\ncentred around local minutiae-based embeddings, failing to fully exploit global\nrepresentations for matching purposes. Consequently, enhancing latent\nfingerprints becomes critical to ensuring robust identification for forensic\ninvestigations. Current approaches often prioritise restoring ridge patterns,\noverlooking the finer details crucial for accurate fingerprint\nrecognition. 
To address this, we propose a novel approach that uses generative\nadversarial networks (GANs) to redefine Latent Fingerprint Enhancement (LFE)\nthrough a structured approach to fingerprint generation. By directly optimising\nthe minutiae information during the generation process, the model produces\nenhanced latent fingerprints that exhibit exceptional fidelity to ground-truth\ninstances. This leads to a significant improvement in identification\nperformance. Our framework integrates minutiae locations and orientation\nfields, ensuring the preservation of both local and structural fingerprint\nfeatures. Extensive evaluations conducted on two publicly available datasets\ndemonstrate our method's superiority over existing state-of-the-art techniques,\nhighlighting its potential to significantly enhance latent fingerprint\nrecognition accuracy in forensic applications.\n","authors":["Abdul Wahab","Tariq Mahmood Khan","Shahzaib Iqbal","Bandar AlShammari","Bandar Alhaqbani","Imran Razzak"],"pdf_url":"https://arxiv.org/pdf/2409.11802v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.09748v3","updated":"2024-09-18T08:25:18Z","published":"2024-04-15T12:50:44Z","title":"LetsGo: Large-Scale Garage Modeling and Rendering via LiDAR-Assisted\n Gaussian Primitives","summary":" Large garages are ubiquitous yet intricate scenes that present unique\nchallenges due to their monotonous colors, repetitive patterns, reflective\nsurfaces, and transparent vehicle glass. Conventional Structure from Motion\n(SfM) methods for camera pose estimation and 3D reconstruction often fail in\nthese environments due to poor correspondence construction. To address these\nchallenges, we introduce LetsGo, a LiDAR-assisted Gaussian splatting framework\nfor large-scale garage modeling and rendering. We develop a handheld scanner,\nPolar, equipped with IMU, LiDAR, and a fisheye camera, to facilitate accurate\ndata acquisition. Using this Polar device, we present the GarageWorld dataset,\nconsisting of eight expansive garage scenes with diverse geometric structures,\nwhich will be made publicly available for further research. Our approach\ndemonstrates that LiDAR point clouds collected by the Polar device\nsignificantly enhance a suite of 3D Gaussian splatting algorithms for garage\nscene modeling and rendering. We introduce a novel depth regularizer that\neffectively eliminates floating artifacts in rendered images. Additionally, we\npropose a multi-resolution 3D Gaussian representation designed for\nLevel-of-Detail (LOD) rendering. This includes adapted scaling factors for\nindividual levels and a random-resolution-level training scheme to optimize the\nGaussians across different resolutions. This representation enables efficient\nrendering of large-scale garage scenes on lightweight devices via a web-based\nrenderer. 
Experimental results on our GarageWorld dataset, as well as on\nScanNet++ and KITTI-360, demonstrate the superiority of our method in terms of\nrendering quality and resource efficiency.\n","authors":["Jiadi Cui","Junming Cao","Fuqiang Zhao","Zhipeng He","Yifan Chen","Yuhui Zhong","Lan Xu","Yujiao Shi","Yingliang Zhang","Jingyi Yu"],"pdf_url":"https://arxiv.org/pdf/2404.09748v3.pdf","comment":"Project Page: https://zhaofuq.github.io/LetsGo/"},{"id":"http://arxiv.org/abs/2403.11577v2","updated":"2024-09-18T08:22:57Z","published":"2024-03-18T08:53:03Z","title":"3DGS-Calib: 3D Gaussian Splatting for Multimodal SpatioTemporal\n Calibration","summary":" Reliable multimodal sensor fusion algorithms require accurate spatiotemporal\ncalibration. Recently, targetless calibration techniques based on implicit\nneural representations have proven to provide precise and robust results.\nNevertheless, such methods are inherently slow to train given the high\ncomputational overhead caused by the large number of sampled points required\nfor volume rendering. With the recent introduction of 3D Gaussian Splatting as\na faster alternative to implicit representation methods, we propose to leverage\nthis new rendering approach to achieve faster multi-sensor calibration. We\nintroduce 3DGS-Calib, a new calibration method that relies on the speed and\nrendering accuracy of 3D Gaussian Splatting to achieve multimodal\nspatiotemporal calibration that is accurate, robust, and with a substantial\nspeed-up compared to methods relying on implicit neural representations. We\ndemonstrate the superiority of our proposal with experimental results on\nsequences from KITTI-360, a widely used driving dataset.\n","authors":["Quentin Herau","Moussab Bennehar","Arthur Moreau","Nathan Piasco","Luis Roldao","Dzmitry Tsishkou","Cyrille Migniot","Pascal Vasseur","Cédric Demonceaux"],"pdf_url":"https://arxiv.org/pdf/2403.11577v2.pdf","comment":"Accepted at IROS 2024 (Oral presentation). Project page:\n https://qherau.github.io/3DGS-Calib/"},{"id":"http://arxiv.org/abs/2409.11786v1","updated":"2024-09-18T08:10:35Z","published":"2024-09-18T08:10:35Z","title":"Efficient Low-Resolution Face Recognition via Bridge Distillation","summary":" Face recognition in the wild is now advancing towards light-weight models,\nfast inference speed and resolution-adapted capability. In this paper, we\npropose a bridge distillation approach to turn a complex face model pretrained\non private high-resolution faces into a light-weight one for low-resolution\nface recognition. In our approach, such a cross-dataset resolution-adapted\nknowledge transfer problem is solved via two-step distillation. In the first\nstep, we conduct cross-dataset distillation to transfer the prior knowledge\nfrom private high-resolution faces to public high-resolution faces and generate\ncompact and discriminative features. In the second step, the resolution-adapted\ndistillation is conducted to further transfer the prior knowledge to synthetic\nlow-resolution faces via multi-task learning. By learning low-resolution face\nrepresentations and mimicking the adapted high-resolution knowledge, a\nlight-weight student model can be constructed with high efficiency and\npromising accuracy in recognizing low-resolution faces. Experimental results\nshow that the student model performs impressively in recognizing low-resolution\nfaces with only 0.21M parameters and 0.057MB memory. 
Meanwhile, its speed\nreaches up to 14,705, ~934 and 763 faces per second on GPU, CPU and mobile\nphone, respectively.\n","authors":["Shiming Ge","Shengwei Zhao","Chenyu Li","Yu Zhang","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2409.11786v1.pdf","comment":"This paper is published in IEEE TIP 2020"},{"id":"http://arxiv.org/abs/2409.11785v1","updated":"2024-09-18T08:09:20Z","published":"2024-09-18T08:09:20Z","title":"Distilling Channels for Efficient Deep Tracking","summary":" Deep trackers have proven successful in visual tracking. Typically, these\ntrackers employ optimally pre-trained deep networks to represent all diverse\nobjects with multi-channel features from some fixed layers. The deep networks\nemployed are usually trained to extract rich knowledge from massive data used\nin object classification and so they are capable of representing generic objects\nvery well. However, these networks are too complex to represent a specific\nmoving object, leading to poor generalization as well as high computational and\nmemory costs. This paper presents a novel and general framework termed channel\ndistillation to facilitate deep trackers. To validate the effectiveness of\nchannel distillation, we take discriminative correlation filter (DCF) and ECO\nas examples. We demonstrate that an integrated formulation can turn feature\ncompression, response map generation, and model update into a unified energy\nminimization problem to adaptively select informative feature channels that\nimprove the efficacy of tracking moving objects on the fly. Channel\ndistillation can accurately extract good channels, alleviating the influence of\nnoisy channels and generally reducing the number of channels, as well as\nadaptively generalizing to different channels and networks. The resulting deep\ntracker is accurate, fast, and has low memory requirements. Extensive\nexperimental evaluations on popular benchmarks clearly demonstrate the\neffectiveness and generalizability of our framework.\n","authors":["Shiming Ge","Zhao Luo","Chunhui Zhang","Yingying Hua","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2409.11785v1.pdf","comment":"Published by IEEE TIP 2020"},{"id":"http://arxiv.org/abs/2409.11770v1","updated":"2024-09-18T07:51:38Z","published":"2024-09-18T07:51:38Z","title":"Knowledge Adaptation Network for Few-Shot Class-Incremental Learning","summary":" Few-shot class-incremental learning (FSCIL) aims to incrementally recognize\nnew classes using a few samples while maintaining the performance on previously\nlearned classes. One of the effective methods to solve this challenge is to\nconstruct prototypical evolution classifiers. Despite the advancement achieved\nby most existing methods, the classifier weights are simply initialized using\nmean features. Because representations for new classes are weak and biased, we\nargue such a strategy is suboptimal. In this paper, we tackle this issue from\ntwo aspects. Firstly, thanks to the development of foundation models, we employ\na foundation model, CLIP, as the network pedestal to provide a general\nrepresentation for each class. Secondly, to generate a more reliable and\ncomprehensive instance representation, we propose a Knowledge Adapter (KA)\nmodule that summarizes the data-specific knowledge from training data and fuses\nit into the general representation. 
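The channel distillation idea in the "Distilling Channels for Efficient Deep Tracking" abstract above couples feature compression, response generation, and model update in one energy minimization; the toy PyTorch fragment below illustrates only the channel-pruning intuition, ranking channels by an assumed energy statistic and keeping the top-k.

```python
import torch

def select_channels(features, k):
    """Keep the k channels of a (B, C, H, W) feature map with the highest
    average energy (an illustrative stand-in for the paper's adaptive,
    tracking-driven channel selection)."""
    energy = features.pow(2).mean(dim=(0, 2, 3))   # (C,) per-channel energy
    idx = torch.topk(energy, k).indices            # strongest channels
    return features[:, idx], idx

feats = torch.randn(1, 512, 32, 32)                # e.g. a conv feature map
compressed, kept = select_channels(feats, k=64)    # (1, 64, 32, 32)
```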
Additionally, to tune the knowledge learned\nfrom the base classes to the upcoming classes, we propose a mechanism of\nIncremental Pseudo Episode Learning (IPEL) by simulating the actual FSCIL.\nTaken together, our proposed method, dubbed as Knowledge Adaptation Network\n(KANet), achieves competitive performance on a wide range of datasets,\nincluding CIFAR100, CUB200, and ImageNet-R.\n","authors":["Ye Wang","Yaxiong Wang","Guoshuai Zhao","Xueming Qian"],"pdf_url":"https://arxiv.org/pdf/2409.11770v1.pdf","comment":"13 pages;6 figures"},{"id":"http://arxiv.org/abs/2407.10195v2","updated":"2024-09-18T07:26:55Z","published":"2024-07-14T13:34:00Z","title":"V2I-Calib: A Novel Calibration Approach for Collaborative Vehicle and\n Infrastructure LiDAR Systems","summary":" Cooperative LiDAR systems integrating vehicles and road infrastructure,\ntermed V2I calibration, exhibit substantial potential, yet their deployment\nencounters numerous challenges. A pivotal aspect of ensuring data accuracy and\nconsistency across such systems involves the calibration of LiDAR units across\nheterogeneous vehicular and infrastructural endpoints. This necessitates the\ndevelopment of calibration methods that are both real-time and robust,\nparticularly those that can ensure robust performance in urban canyon scenarios\nwithout relying on initial positioning values. Accordingly, this paper\nintroduces a novel approach to V2I calibration, leveraging spatial association\ninformation among perceived objects. Central to this method is the innovative\nOverall Intersection over Union (oIoU) metric, which quantifies the correlation\nbetween targets identified by vehicle and infrastructure systems, thereby\nfacilitating the real-time monitoring of calibration results. Our approach\ninvolves identifying common targets within the perception results of vehicle\nand infrastructure LiDAR systems through the construction of an affinity\nmatrix. These common targets then form the basis for the calculation and\noptimization of extrinsic parameters. Comparative and ablation studies\nconducted using the DAIR-V2X dataset substantiate the superiority of our\napproach. For further insights and resources, our project repository is\naccessible at https://github.com/MassimoQu/v2i-calib.\n","authors":["Qianxin Qu","Yijin Xiong","Guipeng Zhang","Xin Wu","Xiaohan Gao","Xin Gao","Hanyu Li","Shichun Guo","Guoying Zhang"],"pdf_url":"https://arxiv.org/pdf/2407.10195v2.pdf","comment":"IROS2024"},{"id":"http://arxiv.org/abs/2409.11750v1","updated":"2024-09-18T07:09:19Z","published":"2024-09-18T07:09:19Z","title":"Neural Encoding for Image Recall: Human-Like Memory","summary":" Achieving human-like memory recall in artificial systems remains a\nchallenging frontier in computer vision. Humans demonstrate remarkable ability\nto recall images after a single exposure, even after being shown thousands of\nimages. However, this capacity diminishes significantly when confronted with\nnon-natural stimuli such as random textures. In this paper, we present a method\ninspired by human memory processes to bridge this gap between artificial and\nbiological memory systems. Our approach focuses on encoding images to mimic the\nhigh-level information retained by the human brain, rather than storing raw\npixel data. By adding noise to images before encoding, we introduce variability\nakin to the non-deterministic nature of human memory encoding. 
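A toy rendering of this noise-then-encode scheme and a similarity-based recall test; the encoder, noise scale, and threshold are all assumed for illustration and are not the paper's setup.

```python
import torch
import torch.nn.functional as F

def encode_with_noise(image, encoder, sigma=0.1):
    """Perturb the input before embedding, mimicking the non-deterministic
    encoding described above (sigma is an assumed noise scale)."""
    noisy = image + sigma * torch.randn_like(image)
    with torch.no_grad():
        return encoder(noisy)

def seen_before(query_emb, memory_embs, threshold=0.9):
    """Declare an image 'recalled' if its embedding is close (cosine) to any
    stored embedding; the threshold is a placeholder, not a reported value."""
    sims = F.cosine_similarity(query_emb, memory_embs)  # (N,)
    return bool((sims > threshold).any())

# toy usage with a throwaway encoder
encoder = torch.nn.Sequential(torch.nn.Flatten(), torch.nn.LazyLinear(64))
memory = torch.cat(
    [encode_with_noise(torch.randn(1, 3, 32, 32), encoder) for _ in range(10)]
)
probe = torch.randn(1, 3, 32, 32)
print(seen_before(encode_with_noise(probe, encoder), memory))
```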
Leveraging\npre-trained models' embedding layers, we explore how different architectures\nencode images and their impact on memory recall. Our method achieves impressive\nresults, with 97% accuracy on natural images and near-random performance (52%)\non textures. We provide insights into the encoding process and its implications\nfor machine learning memory systems, shedding light on the parallels between\nhuman and artificial intelligence memory mechanisms.\n","authors":["Virgile Foussereau","Robin Dumas"],"pdf_url":"https://arxiv.org/pdf/2409.11750v1.pdf","comment":"5 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.11749v1","updated":"2024-09-18T07:08:08Z","published":"2024-09-18T07:08:08Z","title":"RockTrack: A 3D Robust Multi-Camera-Ken Multi-Object Tracking Framework","summary":" 3D Multi-Object Tracking (MOT) obtains significant performance improvements\nwith the rapid advancements in 3D object detection, particularly in\ncost-effective multi-camera setups. However, the prevalent end-to-end training\napproach for multi-camera trackers results in detector-specific models,\nlimiting their versatility. Moreover, current generic trackers overlook the\nunique features of multi-camera detectors, i.e., the unreliability of motion\nobservations and the feasibility of visual information. To address these\nchallenges, we propose RockTrack, a 3D MOT method for multi-camera detectors.\nFollowing the Tracking-By-Detection framework, RockTrack is compatible with\nvarious off-the-shelf detectors. RockTrack incorporates a confidence-guided\npreprocessing module to extract reliable motion and image observations from\ndistinct representation spaces from a single detector. These observations are\nthen fused in an association module that leverages geometric and appearance\ncues to minimize mismatches. The resulting matches are propagated through a\nstaged estimation process, forming the basis for heuristic noise modeling.\nAdditionally, we introduce a novel appearance similarity metric for explicitly\ncharacterizing object affinities in multi-camera settings. RockTrack achieves\nstate-of-the-art performance on the nuScenes vision-only tracking leaderboard\nwith 59.1% AMOTA while demonstrating impressive computational efficiency.\n","authors":["Xiaoyu Li","Peidong Li","Lijun Zhao","Dedong Liu","Jinghan Gao","Xian Wu","Yitao Wu","Dixiao Cui"],"pdf_url":"https://arxiv.org/pdf/2409.11749v1.pdf","comment":"RockTrack establishes a new state-of-the-art with 59.1% AMOTA on the\n nuScenes vision-only test leaderboard with ResNet50-level backbone"},{"id":"http://arxiv.org/abs/2409.11744v1","updated":"2024-09-18T06:56:06Z","published":"2024-09-18T06:56:06Z","title":"Exploring Gaze Pattern in Autistic Children: Clustering, Visualization,\n and Prediction","summary":" Autism Spectrum Disorder (ASD) significantly affects the social and\ncommunication abilities of children, and eye-tracking is commonly used as a\ndiagnostic tool by identifying associated atypical gaze patterns. Traditional\nmethods demand manual identification of Areas of Interest in gaze patterns,\nlowering the performance of gaze behavior analysis in ASD subjects. To tackle\nthis limitation, we propose a novel method to automatically analyze gaze\nbehaviors in ASD children with superior accuracy. To be specific, we first\napply and optimize seven clustering algorithms to automatically group gaze\npoints to compare ASD subjects with typically developing peers. Subsequently,\nwe extract 63 significant features to fully describe the patterns. 
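As an illustrative sketch of this clustering stage: k-means stands in for the seven unnamed algorithms, and the toy cluster statistics below are not the authors' 63 features.

```python
import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
gaze_points = rng.random((500, 2))               # (x, y) fixations on a stimulus
km = KMeans(n_clusters=5, n_init=10, random_state=0).fit(gaze_points)

centers = km.cluster_centers_                    # candidate areas of interest
counts = np.bincount(km.labels_, minlength=5)    # fixations per area
spread = np.array([gaze_points[km.labels_ == c].std() for c in range(5)])
features = np.concatenate([centers.ravel(), counts, spread])  # feed a classifier
```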
These\nfeatures can describe correlations between ASD diagnosis and gaze patterns.\nLastly, using these features as prior knowledge, we train multiple predictive\nmachine learning models to predict and diagnose ASD based on their gaze\nbehaviors. To evaluate our method, we apply it to three ASD datasets.\nThe experimental and visualization results demonstrate the effectiveness of the\nclustering algorithms in analyzing unique gaze patterns in ASD children.\nAdditionally, these predictive machine learning models achieved\nstate-of-the-art prediction performance ($81\%$ AUC) in the field of\nautomatically constructed gaze point features for ASD diagnosis. Our code is\navailable at \url{https://github.com/username/projectname}.\n","authors":["Weiyan Shi","Haihong Zhang","Jin Yang","Ruiqing Ding","YongWei Zhu","Kenny Tsu Wei Choo"],"pdf_url":"https://arxiv.org/pdf/2409.11744v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.11451v3","updated":"2024-09-18T06:53:40Z","published":"2024-06-17T12:03:32Z","title":"CoMT: Chain-of-Medical-Thought Reduces Hallucination in Medical Report\n Generation","summary":" Automatic medical report generation (MRG), which possesses significant\nresearch value as it can aid radiologists in clinical diagnosis and report\ncomposition, has garnered increasing attention. Despite recent progress,\ngenerating accurate reports remains arduous due to the requirement for precise\nclinical comprehension and disease diagnosis inference. Furthermore, owing to\nthe limited accessibility of medical data and the imbalanced distribution of\ndiseases, the underrepresentation of rare diseases in training data makes\nlarge-scale medical visual language models (LVLMs) prone to hallucinations,\nsuch as omissions or fabrications, severely undermining diagnostic performance\nand further intensifying the challenges for MRG in practice. In this study, to\neffectively mitigate hallucinations in medical report generation, we propose a\nchain-of-medical-thought approach (CoMT), which intends to imitate the\ncognitive process of human doctors by decomposing diagnostic procedures. The\nradiological features with different importance are structured into\nfine-grained medical thought chains to enhance the inferential ability during\ndiagnosis, thereby alleviating hallucination problems and enhancing the\ndiagnostic accuracy of MRG. All resources of this work will be released soon.\n","authors":["Yue Jiang","Jiawei Chen","Dingkang Yang","Mingcheng Li","Shunli Wang","Tong Wu","Ke Li","Lihua Zhang"],"pdf_url":"https://arxiv.org/pdf/2406.11451v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01085v2","updated":"2024-09-18T06:43:54Z","published":"2023-04-03T15:42:27Z","title":"Unsupervised Cross-domain Pulmonary Nodule Detection without Source Data","summary":" Cross-domain pulmonary nodule detection suffers from performance degradation\ndue to a large shift of data distributions between the source and target\ndomain. Besides, considering the high cost of medical data annotation, it is\noften assumed that the target images are unlabeled. Existing approaches have\nmade much progress for this unsupervised domain adaptation setting. However,\nthis setting is still rarely plausible in medical applications since the source\nmedical data are often not accessible due to privacy concerns. This motivates\nus to propose a Source-free Unsupervised cross-domain method for Pulmonary\nnodule detection (SUP), named Instance-level Contrastive Instruction\nfine-tuning framework (ICI). 
It first adapts the source model to the target\ndomain by utilizing instance-level contrastive learning. Then the adapted model\nis trained in a teacher-student interaction manner, and a weighted entropy loss\nis incorporated to further improve the accuracy. We establish a benchmark by\nadapting a pre-trained source model to three popular datasets for pulmonary\nnodule detection. To the best of our knowledge, this represents the first\nexploration of source-free unsupervised domain adaptation in medical image\nobject detection. Our extensive evaluations reveal that SUP-ICI substantially\nsurpasses existing state-of-the-art approaches, achieving FROC score\nimprovements ranging from 8.98% to 16.05%. This breakthrough not only sets a\nnew precedent for domain adaptation techniques in medical imaging but also\nsignificantly advances the field toward overcoming challenges posed by data\nprivacy and availability. Code: https://github.com/Ruixxxx/SFUDA.\n","authors":["Rui Xu","Yong Luo","Bo Du"],"pdf_url":"https://arxiv.org/pdf/2304.01085v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11734v1","updated":"2024-09-18T06:43:40Z","published":"2024-09-18T06:43:40Z","title":"InverseMeetInsert: Robust Real Image Editing via Geometric Accumulation\n Inversion in Guided Diffusion Models","summary":" In this paper, we introduce Geometry-Inverse-Meet-Pixel-Insert, short for\nGEO, an exceptionally versatile image editing technique designed to cater to\ncustomized user requirements at both local and global scales. Our approach\nseamlessly integrates text prompts and image prompts to yield diverse and\nprecise editing outcomes. Notably, our method operates without the need for\ntraining and is driven by two key contributions: (i) a novel geometric\naccumulation loss that enhances DDIM inversion to faithfully preserve pixel\nspace geometry and layout, and (ii) an innovative boosted image prompt\ntechnique that combines pixel-level editing for text-only inversion with latent\nspace geometry guidance for standard classifier-free reversion. Leveraging the\npublicly available Stable Diffusion model, our approach undergoes extensive\nevaluation across various image types and challenging prompt editing scenarios,\nconsistently delivering high-fidelity editing results for real images.\n","authors":["Yan Zheng","Lemeng Wu"],"pdf_url":"https://arxiv.org/pdf/2409.11734v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2407.14007v2","updated":"2024-09-18T06:39:50Z","published":"2024-07-19T03:43:48Z","title":"Multi-modal Relation Distillation for Unified 3D Representation Learning","summary":" Recent advancements in multi-modal pre-training for 3D point clouds have\ndemonstrated promising results by aligning heterogeneous features across 3D\nshapes and their corresponding 2D images and language descriptions. However,\ncurrent straightforward solutions often overlook intricate structural relations\namong samples, potentially limiting the full capabilities of multi-modal\nlearning. To address this issue, we introduce Multi-modal Relation Distillation\n(MRD), a tri-modal pre-training framework, which is designed to effectively\ndistill reputable large Vision-Language Models (VLM) into 3D backbones. MRD\naims to capture both intra-relations within each modality as well as\ncross-relations between different modalities and produce more discriminative 3D\nshape representations. 
Notably, MRD achieves significant improvements in\ndownstream zero-shot classification tasks and cross-modality retrieval tasks,\ndelivering new state-of-the-art performance.\n","authors":["Huiqun Wang","Yiping Bao","Panwang Pan","Zeming Li","Xiao Liu","Ruijie Yang","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2407.14007v2.pdf","comment":"Accepted by ECCV2024"},{"id":"http://arxiv.org/abs/2409.11729v1","updated":"2024-09-18T06:38:48Z","published":"2024-09-18T06:38:48Z","title":"DETECLAP: Enhancing Audio-Visual Representation Learning with Object\n Information","summary":" Current audio-visual representation learning can capture rough object\ncategories (e.g., ``animals'' and ``instruments''), but it lacks the ability to\nrecognize fine-grained details, such as specific categories like ``dogs'' and\n``flutes'' within animals and instruments. To address this issue, we introduce\nDETECLAP, a method to enhance audio-visual representation learning with object\ninformation. Our key idea is to introduce an audio-visual label prediction loss\nto the existing Contrastive Audio-Visual Masked AutoEncoder to enhance its\nobject awareness. To avoid costly manual annotations, we prepare object labels\nfrom both audio and visual inputs using state-of-the-art language-audio models\nand object detectors. We evaluate the method of audio-visual retrieval and\nclassification using the VGGSound and AudioSet20K datasets. Our method achieves\nimprovements in recall@10 of +1.5% and +1.2% for audio-to-visual and\nvisual-to-audio retrieval, respectively, and an improvement in accuracy of\n+0.6% for audio-visual classification.\n","authors":["Shota Nakada","Taichi Nishimura","Hokuto Munakata","Masayoshi Kondo","Tatsuya Komatsu"],"pdf_url":"https://arxiv.org/pdf/2409.11729v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2409.09605v2","updated":"2024-09-18T06:34:47Z","published":"2024-09-15T04:09:12Z","title":"DreamMover: Leveraging the Prior of Diffusion Models for Image\n Interpolation with Large Motion","summary":" We study the problem of generating intermediate images from image pairs with\nlarge motion while maintaining semantic consistency. Due to the large motion,\nthe intermediate semantic information may be absent in input images. Existing\nmethods either limit to small motion or focus on topologically similar objects,\nleading to artifacts and inconsistency in the interpolation results. To\novercome this challenge, we delve into pre-trained image diffusion models for\ntheir capabilities in semantic cognition and representations, ensuring\nconsistent expression of the absent intermediate semantic representations with\nthe input. To this end, we propose DreamMover, a novel image interpolation\nframework with three main components: 1) A natural flow estimator based on the\ndiffusion model that can implicitly reason about the semantic correspondence\nbetween two images. 2) To avoid the loss of detailed information during fusion,\nour key insight is to fuse information in two parts, high-level space and\nlow-level space. 3) To enhance the consistency between the generated images and\ninput, we propose the self-attention concatenation and replacement approach.\nLastly, we present a challenging benchmark dataset InterpBench to evaluate the\nsemantic consistency of generated results. Extensive experiments demonstrate\nthe effectiveness of our method. 
Our project is available at\nhttps://dreamm0ver.github.io .\n","authors":["Liao Shen","Tianqi Liu","Huiqiang Sun","Xinyi Ye","Baopu Li","Jianming Zhang","Zhiguo Cao"],"pdf_url":"https://arxiv.org/pdf/2409.09605v2.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2404.02517v3","updated":"2024-09-18T06:33:32Z","published":"2024-04-03T07:10:18Z","title":"HENet: Hybrid Encoding for End-to-end Multi-task 3D Perception from\n Multi-view Cameras","summary":" Three-dimensional perception from multi-view cameras is a crucial component\nin autonomous driving systems, which involves multiple tasks like 3D object\ndetection and bird's-eye-view (BEV) semantic segmentation. To improve\nperception precision, large image encoders, high-resolution images, and\nlong-term temporal inputs have been adopted in recent 3D perception models,\nbringing remarkable performance gains. However, these techniques are often\nincompatible in training and inference scenarios due to computational resource\nconstraints. Besides, modern autonomous driving systems prefer to adopt an\nend-to-end framework for multi-task 3D perception, which can simplify the\noverall system architecture and reduce the implementation complexity. However,\nconflict between tasks often arises when optimizing multiple tasks jointly\nwithin an end-to-end 3D perception model. To alleviate these issues, we present\nan end-to-end framework named HENet for multi-task 3D perception in this paper.\nSpecifically, we propose a hybrid image encoding network, using a large image\nencoder for short-term frames and a small image encoder for long-term temporal\nframes. Then, we introduce a temporal feature integration module based on the\nattention mechanism to fuse the features of different frames extracted by the\ntwo aforementioned hybrid image encoders. Finally, according to the\ncharacteristics of each perception task, we utilize BEV features of different\ngrid sizes, independent BEV encoders, and task decoders for different tasks.\nExperimental results show that HENet achieves state-of-the-art end-to-end\nmulti-task 3D perception results on the nuScenes benchmark, including 3D object\ndetection and BEV semantic segmentation. The source code and models will be\nreleased at https://github.com/VDIGPKU/HENet.\n","authors":["Zhongyu Xia","ZhiWei Lin","Xinhao Wang","Yongtao Wang","Yun Xing","Shengxiang Qi","Nan Dong","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2404.02517v3.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2405.16788v4","updated":"2024-09-18T06:09:04Z","published":"2024-05-27T03:23:25Z","title":"3D Reconstruction with Fast Dipole Sums","summary":" We introduce a method for high-quality 3D reconstruction from multi-view\nimages. Our method uses a new point-based representation, the regularized\ndipole sum, which generalizes the winding number to allow for interpolation of\nper-point attributes in point clouds with noisy or outlier points. Using\nregularized dipole sums, we represent implicit geometry and radiance fields as\nper-point attributes of a dense point cloud, which we initialize from structure\nfrom motion. We additionally derive Barnes-Hut fast summation schemes for\naccelerated forward and adjoint dipole sum queries. These queries facilitate\nthe use of ray tracing to efficiently and differentiably render images with our\npoint-based representations, and thus update their point attributes to optimize\nscene geometry and appearance. 
We evaluate our method in inverse rendering\napplications against state-of-the-art alternatives, based on ray tracing of\nneural representations or rasterization of Gaussian point-based\nrepresentations. Our method significantly improves 3D reconstruction quality\nand robustness at equal runtimes, while also supporting more general rendering\nmethods such as shadow rays for direct illumination.\n","authors":["Hanyu Chen","Bailey Miller","Ioannis Gkioulekas"],"pdf_url":"https://arxiv.org/pdf/2405.16788v4.pdf","comment":"project page: https://imaging.cs.cmu.edu/fast_dipole_sums"},{"id":"http://arxiv.org/abs/2409.11711v1","updated":"2024-09-18T05:33:42Z","published":"2024-09-18T05:33:42Z","title":"LFIC-DRASC: Deep Light Field Image Compression Using Disentangled\n Representation and Asymmetrical Strip Convolution","summary":" The Light-Field (LF) image is an emerging form of 4D data over light rays that is\ncapable of realistically presenting the spatial and angular information of a 3D\nscene. However, the large data volume of LF images becomes the most challenging issue in\nreal-time processing, transmission, and storage. In this paper, we propose an\nend-to-end deep LF Image Compression method Using Disentangled Representation\nand Asymmetrical Strip Convolution (LFIC-DRASC) to improve coding efficiency.\nFirstly, we formulate the LF image compression problem as learning a\ndisentangled LF representation network and an image encoding-decoding network.\nSecondly, we propose two novel feature extractors that leverage the structural\nprior of LF data by integrating features across different dimensions.\nMeanwhile, a disentangled LF representation network is proposed to enhance the LF\nfeature disentangling and decoupling. Thirdly, we propose the LFIC-DRASC for LF\nimage compression, where two Asymmetrical Strip Convolution (ASC) operators,\ni.e. horizontal and vertical, are proposed to capture long-range correlation in\nLF feature space. These two ASC operators can be combined with the square\nconvolution to further decouple LF features, which enhances the model's ability\nto represent intricate spatial relationships. Experimental results\ndemonstrate that the proposed LFIC-DRASC achieves an average of 20.5\\% bit rate\nreduction compared with the state-of-the-art methods.\n","authors":["Shiyu Feng","Yun Zhang","Linwei Zhu","Sam Kwong"],"pdf_url":"https://arxiv.org/pdf/2409.11711v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11706v1","updated":"2024-09-18T05:16:34Z","published":"2024-09-18T05:16:34Z","title":"RopeBEV: A Multi-Camera Roadside Perception Network in Bird's-Eye-View","summary":" Multi-camera perception methods in Bird's-Eye-View (BEV) have gained wide\napplication in autonomous driving. However, due to the differences between\nroadside and vehicle-side scenarios, there is currently no multi-camera BEV\nsolution for the roadside. This paper systematically analyzes the key challenges in\nmulti-camera BEV perception for roadside scenarios compared to vehicle-side.\nThese challenges include the diversity in camera poses, the uncertainty in\ncamera numbers, the sparsity in perception regions, and the ambiguity in\norientation angles. In response, we introduce RopeBEV, the first dense\nmulti-camera BEV approach. RopeBEV introduces BEV augmentation to address the\ntraining balance issues caused by diverse camera poses. By incorporating\nCamMask and ROIMask (Region of Interest Mask), it supports variable camera\nnumbers and sparse perception, respectively. 
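The Asymmetrical Strip Convolution described in the LFIC-DRASC abstract above can be sketched in PyTorch as a pair of 1 x k and k x 1 convolutions combined with a square convolution; the kernel size, channel counts, and additive fusion are assumptions, not the authors' implementation.

```python
import torch
import torch.nn as nn

class StripConv(nn.Module):
    """Asymmetrical strip convolutions (illustrative sketch): a horizontal
    1 x k and a vertical k x 1 convolution capture long-range correlation
    along each axis, combined with a square conv for local structure."""
    def __init__(self, channels, k=7):
        super().__init__()
        self.horizontal = nn.Conv2d(channels, channels, (1, k), padding=(0, k // 2))
        self.vertical = nn.Conv2d(channels, channels, (k, 1), padding=(k // 2, 0))
        self.square = nn.Conv2d(channels, channels, 3, padding=1)

    def forward(self, x):
        return self.horizontal(x) + self.vertical(x) + self.square(x)

y = StripConv(16)(torch.randn(1, 16, 64, 64))   # shape preserved: (1, 16, 64, 64)
```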
Finally, camera rotation embedding\nis utilized to resolve orientation ambiguity. Our method ranks 1st on the\nreal-world highway dataset RoScenes and demonstrates its practical value on a\nprivate urban dataset that covers more than 50 intersections and 600 cameras.\n","authors":["Jinrang Jia","Guangqi Yi","Yifeng Shi"],"pdf_url":"https://arxiv.org/pdf/2409.11706v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18330v3","updated":"2024-09-18T04:54:28Z","published":"2024-03-27T08:11:25Z","title":"Tracking-Assisted Object Detection with Event Cameras","summary":" Event-based object detection has recently garnered attention in the computer\nvision community due to the exceptional properties of event cameras, such as\nhigh dynamic range and no motion blur. However, feature asynchronism and\nsparsity cause objects to become invisible when they have no relative motion to\nthe camera, posing a significant challenge in the task. Prior works have studied various\nimplicit-learned memories to retain as many temporal cues as possible. However,\nimplicit memories still struggle to preserve long-term features effectively. In\nthis paper, we consider those invisible objects as pseudo-occluded objects and\naim to detect them by tracking through occlusions. Firstly, we introduce the\nvisibility attribute of objects and contribute an auto-labeling algorithm to\nnot only clean the existing event camera dataset but also append additional\nvisibility labels to it. Secondly, we exploit tracking strategies for\npseudo-occluded objects to maintain their permanence and retain their bounding\nboxes, even when features have not been available for a very long time. These\nstrategies can be treated as an explicit-learned memory guided by the tracking\nobjective to record the displacements of objects across frames. Lastly, we\npropose a spatio-temporal feature aggregation module to enrich the latent\nfeatures and a consistency loss to increase the robustness of the overall\npipeline. We conduct comprehensive experiments to verify our method's\neffectiveness, where still objects are retained but truly occluded objects are\ndiscarded. The results demonstrate that (1) the additional visibility labels\ncan assist in supervised training, and (2) our method outperforms\nstate-of-the-art approaches with a significant improvement of 7.9% absolute\nmAP.\n","authors":["Ting-Kang Yen","Igor Morawski","Shusil Dangi","Kai He","Chung-Yi Lin","Jia-Fong Yeh","Hung-Ting Su","Winston Hsu"],"pdf_url":"https://arxiv.org/pdf/2403.18330v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11702v1","updated":"2024-09-18T04:53:38Z","published":"2024-09-18T04:53:38Z","title":"Discovering Conceptual Knowledge with Analytic Ontology Templates for\n Articulated Objects","summary":" Human cognition can leverage fundamental conceptual knowledge, like geometric\nand kinematic ones, to appropriately perceive, comprehend and interact with\nnovel objects. Motivated by this finding, we aim to endow machine intelligence\nwith an analogous capability through performing at the conceptual level, in\norder to understand and then interact with articulated objects, especially for\nthose in novel categories, which is challenging due to the intricate geometric\nstructures and diverse joint types of articulated objects. To achieve this\ngoal, we propose Analytic Ontology Template (AOT), a parameterized and\ndifferentiable program description of generalized conceptual ontologies. 
A\nbaseline approach called AOTNet driven by AOTs is designed accordingly to equip\nintelligent agents with these generalized concepts, and then empower the agents\nto effectively discover the conceptual knowledge on the structure and\naffordance of articulated objects. The AOT-driven approach yields benefits in\nthree key perspectives: i) enabling concept-level understanding of articulated\nobjects without relying on any real training data, ii) providing analytic\nstructure information, and iii) introducing rich affordance information\nindicating proper ways of interaction. We conduct exhaustive experiments and\nthe results demonstrate the superiority of our approach in understanding and\nthen interacting with articulated objects.\n","authors":["Jianhua Sun","Yuxuan Li","Longfei Xu","Jiude Wei","Liang Chai","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2409.11702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17814v5","updated":"2024-09-18T04:40:40Z","published":"2024-05-28T04:18:00Z","title":"FAIntbench: A Holistic and Precise Benchmark for Bias Evaluation in\n Text-to-Image Models","summary":" The rapid development and reduced barriers to entry for Text-to-Image (T2I)\nmodels have raised concerns about the biases in their outputs, but existing\nresearch lacks a holistic definition and evaluation framework of biases,\nlimiting the enhancement of debiasing techniques. To address this issue, we\nintroduce FAIntbench, a holistic and precise benchmark for biases in T2I\nmodels. In contrast to existing benchmarks that evaluate bias in limited\naspects, FAIntbench evaluates biases along four dimensions: manifestation of\nbias, visibility of bias, acquired attributes, and protected attributes. We\napplied FAIntbench to evaluate seven recent large-scale T2I models and\nconducted human evaluation, whose results demonstrated the effectiveness of\nFAIntbench in identifying various biases. Our study also revealed new research\nquestions about biases, including the side-effect of distillation. The findings\npresented here are preliminary, highlighting the potential of FAIntbench to\nadvance future research aimed at mitigating the biases in T2I models. Our\nbenchmark is publicly available to ensure reproducibility.\n","authors":["Hanjun Luo","Ziye Deng","Ruizhe Chen","Zuozhu Liu"],"pdf_url":"https://arxiv.org/pdf/2405.17814v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11692v1","updated":"2024-09-18T04:21:04Z","published":"2024-09-18T04:21:04Z","title":"ORB-SfMLearner: ORB-Guided Self-supervised Visual Odometry with\n Selective Online Adaptation","summary":" Deep visual odometry, despite extensive research, still faces limitations in\naccuracy and generalizability that prevent its broader application. To address\nthese challenges, we propose an Oriented FAST and Rotated BRIEF (ORB)-guided\nvisual odometry with selective online adaptation named ORB-SfMLearner. We\npresent a novel use of ORB features for learning-based ego-motion estimation,\nleading to more robust and accurate results. We also introduce the\ncross-attention mechanism to enhance the explainability of PoseNet and have\nrevealed that the driving direction of the vehicle can be explained through\nattention weights, marking a novel exploration in this area. 
To improve\ngeneralizability, our selective online adaptation allows the network to rapidly\nand selectively adjust to the optimal parameters across different domains.\nExperimental results on KITTI and vKITTI datasets show that our method\noutperforms previous state-of-the-art deep visual odometry methods in terms of\nego-motion accuracy and generalizability.\n","authors":["Yanlin Jin","Rui-Yang Ju","Haojun Liu","Yuzhong Zhong"],"pdf_url":"https://arxiv.org/pdf/2409.11692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11689v1","updated":"2024-09-18T04:05:59Z","published":"2024-09-18T04:05:59Z","title":"GUNet: A Graph Convolutional Network United Diffusion Model for Stable\n and Diversity Pose Generation","summary":" Pose skeleton images are an important reference in pose-controllable image\ngeneration. In order to enrich the source of skeleton images, recent works have\ninvestigated the generation of pose skeletons based on natural language. These\nmethods are based on GANs. However, it remains challenging to perform diverse,\nstructurally correct and aesthetically pleasing human pose skeleton generation\nwith various textual inputs. To address this problem, we propose PoseDiffusion,\na framework with GUNet as the main model. It is the first generative\nframework based on a diffusion model and also contains a series of variants\nfine-tuned based on a stable diffusion model. PoseDiffusion demonstrates\nseveral desirable properties that outperform existing methods. 1) Correct\nSkeletons. GUNet, a denoising model of PoseDiffusion, is designed to\nincorporate graph convolutional neural networks. It is able to learn the\nspatial relationships of the human skeleton by introducing skeletal information\nduring the training process. 2) Diversity. We decouple the key points of the\nskeleton and characterise them separately, and use cross-attention to introduce\ntextual conditions. Experimental results show that PoseDiffusion outperforms\nexisting SoTA algorithms in terms of stability and diversity of text-driven\npose skeleton generation. Qualitative analyses further demonstrate its\nsuperiority for controllable generation in Stable Diffusion.\n","authors":["Shuowen Liang","Sisi Li","Qingyun Wang","Cen Zhang","Kaiquan Zhu","Tian Yang"],"pdf_url":"https://arxiv.org/pdf/2409.11689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11688v1","updated":"2024-09-18T04:00:54Z","published":"2024-09-18T04:00:54Z","title":"SLAM assisted 3D tracking system for laparoscopic surgery","summary":" A major limitation of minimally invasive surgery is the difficulty in\naccurately locating the internal anatomical structures of the target organ due\nto the lack of tactile feedback and transparency. Augmented reality (AR) offers\na promising solution to overcome this challenge. Numerous studies have shown\nthat combining learning-based and geometric methods can achieve accurate\npreoperative and intraoperative data registration. This work proposes a\nreal-time monocular 3D tracking algorithm for post-registration tasks. The\nORB-SLAM2 framework is adopted and modified for prior-based 3D tracking. The\nprimitive 3D shape is used for fast initialization of the monocular SLAM. A\npseudo-segmentation strategy is employed to separate the target organ from the\nbackground for tracking purposes, and the geometric prior of the 3D shape is\nincorporated as an additional constraint in the pose graph. 
Experiments on\nin-vivo and ex-vivo tests demonstrate that the proposed 3D tracking system\nprovides robust 3D tracking and effectively handles typical challenges such as\nfast motion, out-of-field-of-view scenarios, partial visibility, and\n\"organ-background\" relative motion.\n","authors":["Jingwei Song","Ray Zhang","Wenwei Zhang","Hao Zhou","Maani Ghaffari"],"pdf_url":"https://arxiv.org/pdf/2409.11688v1.pdf","comment":"Demo: https://youtu.be/B1xZW8bj3cM"},{"id":"http://arxiv.org/abs/2409.11686v1","updated":"2024-09-18T03:56:56Z","published":"2024-09-18T03:56:56Z","title":"Detecting Underdiagnosed Medical Conditions with Deep Learning-Based\n Opportunistic CT Imaging","summary":" Abdominal computed tomography (CT) scans are frequently performed in clinical\nsettings. Opportunistic CT involves repurposing routine CT images to extract\ndiagnostic information and is an emerging tool for detecting underdiagnosed\nconditions such as sarcopenia, hepatic steatosis, and ascites. This study\nutilizes deep learning methods to promote accurate diagnosis and clinical\ndocumentation. We analyze 2,674 inpatient CT scans to identify discrepancies\nbetween imaging phenotypes (characteristics derived from opportunistic CT\nscans) and their corresponding documentation in radiology reports and ICD\ncoding. Through our analysis, we find that only 0.5%, 3.2%, and 30.7% of scans\ndiagnosed with sarcopenia, hepatic steatosis, and ascites (respectively)\nthrough either opportunistic imaging or radiology reports were ICD-coded. Our\nfindings demonstrate opportunistic CT's potential to enhance diagnostic\nprecision and accuracy of risk adjustment models, offering advancements in\nprecision medicine.\n","authors":["Asad Aali","Andrew Johnston","Louis Blankemeier","Dave Van Veen","Laura T Derry","David Svec","Jason Hom","Robert D. Boutin","Akshay S. Chaudhari"],"pdf_url":"https://arxiv.org/pdf/2409.11686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11682v1","updated":"2024-09-18T03:47:24Z","published":"2024-09-18T03:47:24Z","title":"SRIF: Semantic Shape Registration Empowered by Diffusion-based Image\n Morphing and Flow Estimation","summary":" In this paper, we propose SRIF, a novel Semantic shape Registration framework\nbased on diffusion-based Image morphing and Flow estimation. More concretely,\ngiven a pair of extrinsically aligned shapes, we first render them from\nmultiple views, and then utilize an image interpolation framework based on\ndiffusion models to generate sequences of intermediate images between them. The\nimages are later fed into a dynamic 3D Gaussian splatting framework, with which\nwe reconstruct and post-process intermediate point clouds respecting the\nimage morphing process. In the end, tailored for the above, we propose a\nnovel registration module to estimate continuous normalizing flow, which\ndeforms the source shape consistently towards the target, with intermediate point\nclouds as weak guidance. Our key insight is to leverage large vision models\n(LVMs) to associate shapes and therefore obtain much richer semantic\ninformation on the relationship between shapes than the ad-hoc feature\nextraction and alignment. As a consequence, SRIF not only achieves high-quality\ndense correspondences on challenging shape pairs, but also delivers smooth,\nsemantically meaningful interpolation in between. Empirical evidence justifies\nthe effectiveness and superiority of our method as well as specific design\nchoices. 
The code is released at https://github.com/rqhuang88/SRIF.\n","authors":["Mingze Sun","Chen Guo","Puhua Jiang","Shiwei Mao","Yurun Chen","Ruqi Huang"],"pdf_url":"https://arxiv.org/pdf/2409.11682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11681v1","updated":"2024-09-18T03:45:44Z","published":"2024-09-18T03:45:44Z","title":"Gradient-Driven 3D Segmentation and Affordance Transfer in Gaussian\n Splatting Using 2D Masks","summary":" 3D Gaussian Splatting has emerged as a powerful 3D scene representation\ntechnique, capturing fine details with high efficiency. In this paper, we\nintroduce a novel voting-based method that extends 2D segmentation models to 3D\nGaussian splats. Our approach leverages masked gradients, where gradients are\nfiltered by input 2D masks, and these gradients are used as votes to achieve\naccurate segmentation. As a byproduct, we discovered that inference-time\ngradients can also be used to prune Gaussians, resulting in up to 21%\ncompression. Additionally, we explore few-shot affordance transfer, allowing\nannotations from 2D images to be effectively transferred onto 3D Gaussian\nsplats. The robust yet straightforward mathematical formulation underlying this\napproach makes it a highly effective tool for numerous downstream applications,\nsuch as augmented reality (AR), object editing, and robotics. The project code\nand additional resources are available at\nhttps://jojijoseph.github.io/3dgs-segmentation.\n","authors":["Joji Joseph","Bharadwaj Amrutur","Shalabh Bhatnagar"],"pdf_url":"https://arxiv.org/pdf/2409.11681v1.pdf","comment":"Preprint, Under review for ICRA 2025"},{"id":"http://arxiv.org/abs/2409.11664v1","updated":"2024-09-18T03:02:19Z","published":"2024-09-18T03:02:19Z","title":"Agent Aggregator with Mask Denoise Mechanism for Histopathology Whole\n Slide Image Analysis","summary":" Histopathology analysis is the gold standard for medical diagnosis. Accurate\nclassification of whole slide images (WSIs) and localization of regions of\ninterest (ROIs) can assist pathologists in diagnosis. The gigapixel resolution\nof WSIs and the absence of fine-grained annotations make direct classification\nand analysis challenging. In weakly supervised learning, multiple instance\nlearning (MIL) presents a promising approach for WSI classification. The\nprevailing strategy is to use attention mechanisms to measure instance\nimportance for classification. However, attention mechanisms fail to capture\ninter-instance information, and self-attention causes quadratic computational\ncomplexity. To address these challenges, we propose AMD-MIL, an agent\naggregator with a mask denoise mechanism. The agent token acts as an\nintermediate variable between the query and key for computing instance\nimportance. Mask and denoising matrices, mapped from the agent-aggregated\nvalue, dynamically mask low-contribution representations and eliminate noise.\nAMD-MIL achieves better attention allocation by adjusting feature\nrepresentations, capturing micro-metastases in cancer, and improving\ninterpretability. 
Extensive experiments on CAMELYON-16,\nCAMELYON-17, TCGA-KIDNEY, and TCGA-LUNG show AMD-MIL's superiority over\nstate-of-the-art methods.\n","authors":["Xitong Ling","Minxi Ouyang","Yizhi Wang","Xinrui Chen","Renao Yan","Hongbo Chu","Junru Cheng","Tian Guan","Sufang Tian","Xiaoping Liu","Yonghong He"],"pdf_url":"https://arxiv.org/pdf/2409.11664v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11661v1","updated":"2024-09-18T02:56:50Z","published":"2024-09-18T02:56:50Z","title":"Bridging Domain Gap for Flight-Ready Spaceborne Vision","summary":" This work presents Spacecraft Pose Network v3 (SPNv3), a Neural Network (NN)\nfor monocular pose estimation of a known, non-cooperative target spacecraft. As\nopposed to existing literature, SPNv3 is designed and trained to be\ncomputationally efficient while providing robustness to spaceborne images that\nhave not been observed during offline training and validation on the ground.\nThese characteristics are essential to deploying NNs on space-grade edge\ndevices. They are achieved through careful NN design choices, and an extensive\ntrade-off analysis reveals features such as data augmentation, transfer\nlearning and vision transformer architecture as a few of those that contribute\nto simultaneously maximizing robustness and minimizing computational overhead.\nExperiments demonstrate that the final SPNv3 can achieve state-of-the-art pose\naccuracy on hardware-in-the-loop images from a robotic testbed while having\ntrained exclusively on computer-generated synthetic images, effectively\nbridging the domain gap between synthetic and real imagery. At the same time,\nSPNv3 runs well above the update frequency of modern satellite navigation\nfilters when tested on a representative graphical processing unit system with\nflight heritage. Overall, SPNv3 is an efficient, flight-ready NN model readily\napplicable to a wide range of close-range rendezvous and proximity operations\nwith target resident space objects. The code implementation of SPNv3 will be\nmade publicly available.\n","authors":["Tae Ha Park","Simone D'Amico"],"pdf_url":"https://arxiv.org/pdf/2409.11661v1.pdf","comment":"Submitted to Journal of Spacecraft and Rockets; Appeared as Chapter 4\n of Tae Ha Park's PhD thesis"},{"id":"http://arxiv.org/abs/2409.11656v1","updated":"2024-09-18T02:46:28Z","published":"2024-09-18T02:46:28Z","title":"VL-Reader: Vision and Language Reconstructor is an Effective Scene Text\n Recognizer","summary":" Text recognition is an inherent integration of vision and language,\nencompassing the visual texture in stroke patterns and the semantic context\namong the character sequences. Towards advanced text recognition, there are\nthree key challenges: (1) an encoder capable of representing the visual and\nsemantic distributions; (2) a decoder that ensures the alignment between vision\nand semantics; and (3) consistency in the framework during pre-training, if it\nexists, and fine-tuning. Inspired by masked autoencoding, a successful\npre-training strategy in both vision and language, we propose an innovative\nscene text recognition approach, named VL-Reader. The novelty of the VL-Reader\nlies in the pervasive interplay between vision and language throughout the\nentire process. Concretely, we first introduce a Masked Visual-Linguistic\nReconstruction (MVLR) objective, which aims at simultaneously modeling visual\nand linguistic information. 
Then, we design a Masked Visual-Linguistic Decoder\n(MVLD) to further leverage masked vision-language context and achieve bi-modal\nfeature interaction. The architecture of VL-Reader maintains consistency from\npre-training to fine-tuning. In the pre-training stage, VL-Reader reconstructs\nboth masked visual and text tokens, while in the fine-tuning stage, the network\ndegrades to reconstruct all characters from an image without any masked\nregions. VL-Reader achieves an average accuracy of 97.1% on six typical\ndatasets, surpassing the SOTA by 1.1%. The improvement was even more\nsignificant on challenging datasets. The results demonstrate that a vision and\nlanguage reconstructor can serve as an effective scene text recognizer.\n","authors":["Humen Zhong","Zhibo Yang","Zhaohai Li","Peng Wang","Jun Tang","Wenqing Cheng","Cong Yao"],"pdf_url":"https://arxiv.org/pdf/2409.11656v1.pdf","comment":"Accepted by ACM-MM2024"},{"id":"http://arxiv.org/abs/2409.11653v1","updated":"2024-09-18T02:40:31Z","published":"2024-09-18T02:40:31Z","title":"Enhancing Semi-Supervised Learning via Representative and Diverse Sample\n Selection","summary":" Semi-Supervised Learning (SSL) has become a preferred paradigm in many deep\nlearning tasks, which reduces the need for human labor. Previous studies\nprimarily focus on effectively utilising the labelled and unlabelled data to\nimprove performance. However, we observe that how to select samples for\nlabelling also significantly impacts performance, particularly under extremely\nlow-budget settings. The sample selection task in SSL has been under-explored\nfor a long time. To fill this gap, we propose a Representative and Diverse\nSample Selection approach (RDSS). By adopting a modified Frank-Wolfe algorithm\nto minimise a novel criterion $\\alpha$-Maximum Mean Discrepancy ($\\alpha$-MMD),\nRDSS samples a representative and diverse subset for annotation from the\nunlabelled data. We demonstrate that minimising $\\alpha$-MMD enhances the\ngeneralization ability of low-budget learning. Experimental results show that\nRDSS consistently improves the performance of several popular SSL frameworks\nand outperforms the state-of-the-art sample selection approaches used in Active\nLearning (AL) and Semi-Supervised Active Learning (SSAL), even with constrained\nannotation budgets.\n","authors":["Qian Shao","Jiangrui Kang","Qiyuan Chen","Zepeng Li","Hongxia Xu","Yiwen Cao","Jiajuan Liang","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2409.11653v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2409.11652v1","updated":"2024-09-18T02:37:04Z","published":"2024-09-18T02:37:04Z","title":"Relax DARTS: Relaxing the Constraints of Differentiable Architecture\n Search for Eye Movement Recognition","summary":" Eye movement biometrics is a secure and innovative identification method.\nDeep learning methods have shown good performance, but their network\narchitecture relies on manual design and combined prior knowledge. To address\nthese issues, we introduce neural architecture search (NAS) algorithms to the\nfield of eye movement recognition and present Relax DARTS, which is an\nimprovement of Differentiable Architecture Search (DARTS) that realizes more\nefficient network search and training. The key idea is to circumvent the issue\nof weight sharing by independently training the architecture parameters\n$\\alpha$ to achieve a more precise target architecture. 
Moreover, the\nintroduction of module input weights $\\beta$ allows cells the flexibility to\nselect inputs, alleviating overfitting and improving the model\nperformance. Results on four public databases demonstrate that Relax DARTS\nachieves state-of-the-art recognition performance. Notably, Relax DARTS\nexhibits adaptability to other multi-feature temporal classification tasks.\n","authors":["Hongyu Zhu","Xin Jin","Hongchao Liao","Yan Xiang","Mounim A. El-Yacoubi","Huafeng Qin"],"pdf_url":"https://arxiv.org/pdf/2409.11652v1.pdf","comment":"Accepted By CCBR 2024"},{"id":"http://arxiv.org/abs/2404.17917v3","updated":"2024-09-18T02:26:56Z","published":"2024-04-27T14:10:09Z","title":"EvaNet: Elevation-Guided Flood Extent Mapping on Earth Imagery (Extended\n Version)","summary":" Accurate and timely mapping of flood extent from high-resolution satellite\nimagery plays a crucial role in disaster management, such as damage assessment\nand relief activities. However, current state-of-the-art solutions are based on\nU-Net, which cannot segment flood pixels accurately due to the ambiguous\npixels (e.g., tree canopies, clouds) that prevent a direct judgement from only\nthe spectral features. Thanks to the digital elevation model (DEM) data readily\navailable from sources such as United States Geological Survey (USGS), this\nwork explores the use of an elevation map to improve flood extent mapping. We\npropose EvaNet, an elevation-guided segmentation model based on the\nencoder-decoder architecture with two novel techniques: (1) a loss function\nencoding the physical law of gravity: if a location is flooded (resp. dry),\nthen its adjacent locations with a lower (resp. higher) elevation must also be\nflooded (resp. dry); (2) a new (de)convolution operation that integrates the\nelevation map by a location sensitive gating mechanism to regulate how much\nspectral feature information flows through adjacent layers. Extensive\nexperiments show that EvaNet significantly outperforms the U-Net baselines, and\nworks as a perfect drop-in replacement for U-Net in existing solutions to flood\nextent mapping.\n","authors":["Mirza Tanzim Sami","Da Yan","Saugat Adhikari","Lyuheng Yuan","Jiao Han","Zhe Jiang","Jalal Khalil","Yang Zhou"],"pdf_url":"https://arxiv.org/pdf/2404.17917v3.pdf","comment":"Published at the International Joint Conference on Artificial\n Intelligence (IJCAI, 2024)"},{"id":"http://arxiv.org/abs/2409.11644v1","updated":"2024-09-18T02:15:01Z","published":"2024-09-18T02:15:01Z","title":"Few-Shot Learning Approach on Tuberculosis Classification Based on Chest\n X-Ray Images","summary":" Tuberculosis (TB) is caused by the bacterium Mycobacterium tuberculosis,\nprimarily affecting the lungs. Early detection is crucial for improving\ntreatment effectiveness and reducing transmission risk. Artificial intelligence\n(AI), particularly through image classification of chest X-rays, can assist in\nTB detection. However, class imbalance in TB chest X-ray datasets presents a\nchallenge for accurate classification. In this paper, we propose a few-shot\nlearning (FSL) approach using the Prototypical Network algorithm to address\nthis issue. We compare the performance of ResNet-18, ResNet-50, and VGG16 in\nfeature extraction from the TBX11K Chest X-ray dataset. Experimental results\ndemonstrate classification accuracies of 98.93% for ResNet-18, 98.60% for\nResNet-50, and 33.33% for VGG16. 
These findings indicate that the proposed\nmethod outperforms others in mitigating data imbalance, which is particularly\nbeneficial for disease classification applications.\n","authors":["A. A. G. Yogi Pramana","Faiz Ihza Permana","Muhammad Fazil Maulana","Dzikri Rahadian Fudholi"],"pdf_url":"https://arxiv.org/pdf/2409.11644v1.pdf","comment":"6 pages. Pre-print"},{"id":"http://arxiv.org/abs/2407.21497v2","updated":"2024-09-18T02:14:37Z","published":"2024-07-31T10:11:57Z","title":"Mitral Regurgitation Recogniton based on Unsupervised\n Out-of-Distribution Detection with Residual Diffusion Amplification","summary":" Mitral regurgitation (MR) is a serious heart valve disease. Early and\naccurate diagnosis of MR via ultrasound video is critical for timely clinical\ndecision-making and surgical intervention. However, manual MR diagnosis heavily\nrelies on the operator's experience, which may cause misdiagnosis and\ninter-observer variability. Since MR data is limited and has large intra-class\nvariability, we propose an unsupervised out-of-distribution (OOD) detection\nmethod to identify MR rather than building a deep classifier. To our knowledge,\nwe are the first to explore OOD in MR ultrasound videos. Our method consists of\na feature extractor, a feature reconstruction model, and a residual\naccumulation amplification algorithm. The feature extractor obtains features\nfrom the video clips and feeds them into the feature reconstruction model to\nrestore the original features. The residual accumulation amplification\nalgorithm then iteratively performs noise feature reconstruction, amplifying\nthe reconstruction error of OOD features. This algorithm is straightforward yet\nefficient and can seamlessly integrate as a plug-and-play component in\nreconstruction-based OOD detection methods. We validated the proposed method on\na large ultrasound dataset containing 893 non-MR and 267 MR videos.\nExperimental results show that our OOD detection method can effectively\nidentify MR samples.\n","authors":["Zhe Liu","Xiliang Zhu","Tong Han","Yuhao Huang","Jian Wang","Lian Liu","Fang Wang","Dong Ni","Zhongshan Gou","Xin Yang"],"pdf_url":"https://arxiv.org/pdf/2407.21497v2.pdf","comment":"Accepted by MICCAI MLMI 2024, 11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.11642v1","updated":"2024-09-18T02:14:08Z","published":"2024-09-18T02:14:08Z","title":"DAF-Net: A Dual-Branch Feature Decomposition Fusion Network with Domain\n Adaptive for Infrared and Visible Image Fusion","summary":" Infrared and visible image fusion aims to combine complementary information\nfrom both modalities to provide a more comprehensive scene understanding.\nHowever, due to the significant differences between the two modalities,\npreserving key features during the fusion process remains a challenge. To\naddress this issue, we propose a dual-branch feature decomposition fusion\nnetwork (DAF-Net) with domain adaptation, which introduces Multi-Kernel Maximum\nMean Discrepancy (MK-MMD) into the base encoder and designs a hybrid kernel\nfunction suitable for infrared and visible image fusion. The base encoder built\non the Restormer network captures global structural information, while the\ndetail encoder based on Invertible Neural Networks (INN) focuses on extracting\ndetail texture information. By incorporating MK-MMD, the DAF-Net effectively\naligns the latent feature spaces of visible and infrared images, thereby\nimproving the quality of the fused images. 
Experimental results demonstrate\nthat the proposed method outperforms existing techniques across multiple\ndatasets, significantly enhancing both visual quality and fusion performance.\nThe related Python code is available at https://github.com/xujian000/DAF-Net.\n","authors":["Jian Xu","Xin He"],"pdf_url":"https://arxiv.org/pdf/2409.11642v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.11635v1","updated":"2024-09-18T01:55:00Z","published":"2024-09-18T01:55:00Z","title":"PainDiffusion: Can robot express pain?","summary":" Pain is a more intuitive and user-friendly way of communicating problems,\nmaking it especially useful in rehabilitation nurse training robots. While most\nprevious methods have focused on classifying or recognizing pain expressions,\nthese approaches often result in unnatural, jiggling robot faces. We introduce\nPainDiffusion, a model that generates facial expressions in response to pain\nstimuli, with controllable pain expressiveness and emotion status.\nPainDiffusion leverages diffusion forcing to roll out predictions over\narbitrary lengths using a conditioned temporal U-Net. It operates as a latent\ndiffusion model within EMOCA's facial expression latent space, ensuring a\ncompact data representation and quick rendering time. For training data, we\nprocess the BioVid Heatpain Database, extracting expression codes and subject\nidentity configurations. We also propose a novel set of metrics to evaluate\npain expressions, focusing on expressiveness, diversity, and the\nappropriateness of model-generated outputs. Finally, we demonstrate that\nPainDiffusion outperforms the autoregressive method, both qualitatively and\nquantitatively. Code, videos, and further analysis are available at:\n\\href{https://damtien444.github.io/paindf/}{https://damtien444.github.io/paindf/}.\n","authors":["Quang Tien Dam","Tri Tung Nguyen Nguyen","Dinh Tuan Tran","Joo-Ho Lee"],"pdf_url":"https://arxiv.org/pdf/2409.11635v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2308.00727v2","updated":"2024-09-18T01:24:09Z","published":"2023-08-01T15:37:19Z","title":"Adaptive Semantic Consistency for Cross-domain Few-shot Classification","summary":" Cross-domain few-shot classification (CD-FSC) aims to identify novel target\nclasses with a few samples, assuming that there exists a domain shift between\nsource and target domains. Existing state-of-the-art practices typically\npre-train on the source domain and then finetune on the few-shot target data to\nyield task-adaptive representations. Despite promising progress, these methods\nare prone to overfitting the limited target distribution due to data scarcity,\nand ignore the transferable knowledge learned in the source domain. To\nalleviate this problem, we propose a simple plug-and-play Adaptive Semantic\nConsistency (ASC) framework, which improves cross-domain robustness by\npreserving source transfer capability during the finetuning stage. Concretely,\nwe reuse the source images in the pretraining phase and design an adaptive\nweight assignment strategy to highlight the samples similar to the target\ndomain, aiming to aggregate informative target-related knowledge from the\nsource domain. Subsequently, a semantic consistency regularization is applied\nto constrain the consistency between the semantic features of the source images\noutput by the source model and target model. In this way, the proposed ASC\nenables explicit transfer of source domain knowledge to prevent the model from\noverfitting the target domain. 
Extensive experiments on multiple benchmarks demonstrate the\neffectiveness of the proposed ASC, which provides consistent improvements\nover the baselines. The source code is released at\nhttps://github.com/luhc666/ASC-CDFSL.\n","authors":["Hengchu Lu","Yuanjie Shao","Xiang Wang","Changxin Gao"],"pdf_url":"https://arxiv.org/pdf/2308.00727v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11624v1","updated":"2024-09-18T01:08:49Z","published":"2024-09-18T01:08:49Z","title":"Multimodal Generalized Category Discovery","summary":" Generalized Category Discovery (GCD) aims to classify inputs into both known\nand novel categories, a task crucial for open-world scientific discoveries.\nHowever, current GCD methods are limited to unimodal data, overlooking the\ninherently multimodal nature of most real-world data. In this work, we extend\nGCD to a multimodal setting, where inputs from different modalities provide\nricher and complementary information. Through theoretical analysis and\nempirical validation, we identify that the key challenge in multimodal GCD lies\nin effectively aligning heterogeneous information across modalities. To address\nthis, we propose MM-GCD, a novel framework that aligns both the feature and\noutput spaces of different modalities using contrastive learning and\ndistillation techniques. MM-GCD achieves new state-of-the-art performance on\nthe UPMC-Food101 and N24News datasets, surpassing previous methods by 11.5\\%\nand 4.7\\%, respectively.\n","authors":["Yuchang Su","Renping Zhou","Siyu Huang","Xingjian Li","Tianyang Wang","Ziyue Wang","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2409.11624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11619v1","updated":"2024-09-18T00:51:01Z","published":"2024-09-18T00:51:01Z","title":"Hyperspectral Image Classification Based on Faster Residual Multi-branch\n Spiking Neural Network","summary":" Convolutional neural networks (CNNs) perform well in Hyperspectral Image\n(HSI) classification tasks, but their high energy consumption and complex\nnetwork structure make them difficult to apply directly to edge computing\ndevices. At present, spiking neural networks (SNNs) have developed rapidly in\nHSI classification tasks due to their low energy consumption and event-driven\ncharacteristics. However, they usually require a longer time step to achieve\noptimal accuracy. In response to the above problems, this paper builds a\nspiking neural network (SNN-SWMR) based on the leaky integrate-and-fire (LIF)\nneuron model for HSI classification tasks. The network uses the spiking width\nmixed residual (SWMR) module as the basic unit to perform feature extraction\noperations. The spiking width mixed residual module is composed of spiking\nmixed convolution (SMC), which can effectively extract spatial-spectral\nfeatures. Secondly, this paper designs a simple and efficient arcsine\napproximate derivative (AAD), which solves the non-differentiable problem of\nspike firing by fitting the Dirac function. Through AAD, we can directly train\nsupervised spiking neural networks. Finally, this paper conducts comparative\nexperiments with multiple advanced HSI classification algorithms based on\nspiking neural networks on six public hyperspectral data sets. Experimental\nresults show that the AAD function has strong robustness and a good fitting\neffect. Meanwhile, compared with other algorithms, SNN-SWMR reduces the time\nstep by about 84% and training and testing time by about 63% and 70%,\nrespectively, at the same accuracy. 
This study solves the key problem of\nSNN-based HSI classification algorithms, which has important practical\nsignificance for promoting the practical application of HSI classification\nalgorithms in edge devices such as spaceborne and airborne devices.\n","authors":["Yang Liu","Yahui Li","Rui Li","Liming Zhou","Lanxue Dang","Huiyu Mu","Qiang Ge"],"pdf_url":"https://arxiv.org/pdf/2409.11619v1.pdf","comment":"15 pages, 12 figures"},{"id":"http://arxiv.org/abs/2210.14416v3","updated":"2024-09-18T00:39:53Z","published":"2022-10-26T01:58:09Z","title":"Residual Back Projection With Untrained Neural Networks","summary":" Background and Objective: The success of neural networks in a number of image\nprocessing tasks has motivated their application in image reconstruction\nproblems in computed tomography (CT). While progress has been made in this\narea, the lack of stability and theoretical guarantees for accuracy, together\nwith the scarcity of high-quality training data for specific imaging domains,\nposes challenges for many CT applications. In this paper, we present a framework\nfor iterative reconstruction (IR) in CT that leverages the hierarchical\nstructure of neural networks, without the need for training. Our framework\nincorporates this structural information as a deep image prior (DIP), and uses\na novel residual back projection (RBP) connection that forms the basis for our\niterations.\n Methods: We propose using an untrained U-net in conjunction with a novel\nresidual back projection to minimize an objective function and achieve\nhigh-accuracy reconstruction. In each iteration, the weights of the untrained\nU-net are optimized, and the output of the U-net in the current iteration is\nused to update the input of the U-net in the next iteration through the\naforementioned RBP connection.\n Results: Experimental results demonstrate that the RBP-DIP framework offers\nimprovements over other state-of-the-art conventional IR methods, as well as\npre-trained and untrained models with similar network structures under multiple\nconditions. These improvements are particularly significant in the few-view,\nlimited-angle, and low-dose imaging configurations.\n Conclusions: Applied to both parallel and fan beam X-ray imaging, our\nframework shows significant improvement under multiple conditions. Furthermore,\nthe proposed framework requires no training data and can be adjusted on-demand\nto adapt to different conditions (e.g. noise level, geometry, and imaged\nobject).\n","authors":["Ziyu Shu","Alireza Entezari"],"pdf_url":"https://arxiv.org/pdf/2210.14416v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.19493v2","updated":"2024-09-18T00:31:27Z","published":"2024-07-28T13:23:43Z","title":"Official-NV: An LLM-Generated News Video Dataset for Multimodal Fake\n News Detection","summary":" News media, especially video news media, have penetrated every aspect of\ndaily life, which also brings the risk of fake news. Therefore, multimodal fake\nnews detection has recently garnered increased attention. However, the existing\ndatasets are composed of user-uploaded videos and contain excessive amounts of\nsuperfluous data, which introduces noise into the model training process. To\naddress this issue, we construct a dataset named Official-NV, comprising\nofficially published news videos. The crawled officially published videos are\naugmented through the use of LLM-based generation and manual verification,\nthereby expanding the dataset. 
Furthermore, the proposed dataset is benchmarked\nagainst several baselines to demonstrate its effectiveness in multimodal fake\nnews detection.\n","authors":["Yihao Wang","Lizhi Chen","Zhong Qian","Peifeng Li"],"pdf_url":"https://arxiv.org/pdf/2407.19493v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.12161v1","updated":"2024-09-18T17:25:31Z","published":"2024-09-18T17:25:31Z","title":"Generalized compression and compressive search of large datasets","summary":" The Big Data explosion has necessitated the development of search algorithms\nthat scale sub-linearly in time and memory.\n While compression algorithms and search algorithms do exist independently,\nfew algorithms offer both, and those which do are domain-specific.\n We present panCAKES, a novel approach to compressive search, i.e., a way to\nperform $k$-NN and $\\rho$-NN search on compressed data while only decompressing\na small, relevant, portion of the data.\n panCAKES assumes the manifold hypothesis and leverages the low-dimensional\nstructure of the data to compress and search it efficiently.\n panCAKES is generic over any distance function for which the distance between\ntwo points is proportional to the memory cost of storing an encoding of one in\nterms of the other.\n This property holds for many widely-used distance functions, e.g. string edit\ndistances (Levenshtein, Needleman-Wunsch, etc.) and set dissimilarity measures\n(Jaccard, Dice, etc.).\n We benchmark panCAKES on a variety of datasets, including genomic, proteomic,\nand set data.\n We compare compression ratios to gzip, and search performance between the\ncompressed and uncompressed versions of the same dataset.\n panCAKES achieves compression ratios close to those of gzip, while offering\nsub-linear time performance for $k$-NN and $\\rho$-NN search.\n We conclude that panCAKES is an efficient, general-purpose algorithm for\nexact compressive search on large datasets that obey the manifold hypothesis.\n We provide an open-source implementation of panCAKES in the Rust programming\nlanguage.\n","authors":["Morgan E. Prior","Thomas Howard III","Emily Light","Najib Ishaq","Noah M. Daniels"],"pdf_url":"https://arxiv.org/pdf/2409.12161v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12150v1","updated":"2024-09-18T17:15:06Z","published":"2024-09-18T17:15:06Z","title":"Decoding Style: Efficient Fine-Tuning of LLMs for Image-Guided Outfit\n Recommendation with Preference","summary":" Personalized outfit recommendation remains a complex challenge, demanding\nboth fashion compatibility understanding and trend awareness. This paper\npresents a novel framework that harnesses the expressive power of large\nlanguage models (LLMs) for this task, mitigating their \"black box\" and static\nnature through fine-tuning and direct feedback integration. We bridge the\nvisual-textual gap in item descriptions by employing image captioning with a\nMultimodal Large Language Model (MLLM). This enables the LLM to extract style\nand color characteristics from human-curated fashion images, forming the basis\nfor personalized recommendations. The LLM is efficiently fine-tuned on the\nopen-source Polyvore dataset of curated fashion images, optimizing its ability\nto recommend stylish outfits. A direct preference mechanism using negative\nexamples is employed to enhance the LLM's decision-making process. This creates\na self-enhancing AI feedback loop that continuously refines recommendations in\nline with seasonal fashion trends. 
Our framework is evaluated on the Polyvore\ndataset, demonstrating its effectiveness in two key tasks: fill-in-the-blank\nand complementary item retrieval. These evaluations underline the framework's\nability to generate stylish, trend-aligned outfit suggestions, continuously\nimproving through direct feedback. The evaluation results demonstrated that our\nproposed framework significantly outperforms the base LLM, creating more\ncohesive outfits. The improved performance in these tasks underscores the\nproposed framework's potential to enhance the shopping experience with accurate\nsuggestions, proving its effectiveness over vanilla LLM-based outfit\ngeneration.\n","authors":["Najmeh Forouzandehmehr","Nima Farrokhsiar","Ramin Giahi","Evren Korpeoglu","Kannan Achan"],"pdf_url":"https://arxiv.org/pdf/2409.12150v1.pdf","comment":"CIKM 2024"},{"id":"http://arxiv.org/abs/2409.12043v1","updated":"2024-09-18T15:04:12Z","published":"2024-09-18T15:04:12Z","title":"Understanding the Effects of the Baidu-ULTR Logging Policy on Two-Tower\n Models","summary":" Despite the popularity of the two-tower model for unbiased learning to rank\n(ULTR) tasks, recent work suggests that it suffers from a major limitation that\ncould lead to its collapse in industry applications: the problem of logging\npolicy confounding. Several potential solutions have even been proposed;\nhowever, the evaluation of these methods was mostly conducted using\nsemi-synthetic simulation experiments. This paper bridges the gap between\ntheory and practice by investigating the confounding problem on the largest\nreal-world dataset, Baidu-ULTR. Our main contributions are threefold: 1) we\nshow that the conditions for the confounding problem are met on Baidu-ULTR,\n2) the confounding problem has no significant effect on the two-tower model,\nand 3) we point to a potential mismatch between expert annotations, the gold\nstandard in ULTR, and user click behavior.\n","authors":["Morris de Haan","Philipp Hager"],"pdf_url":"https://arxiv.org/pdf/2409.12043v1.pdf","comment":"Accepted at the CONSEQUENCES '24 workshop, co-located with ACM RecSys\n '24"},{"id":"http://arxiv.org/abs/2409.10576v2","updated":"2024-09-18T13:27:43Z","published":"2024-09-15T15:21:45Z","title":"Language Models and Retrieval Augmented Generation for Automated\n Structured Data Extraction from Diagnostic Reports","summary":" Purpose: To develop and evaluate an automated system for extracting\nstructured clinical information from unstructured radiology and pathology\nreports using open-weights large language models (LMs) and retrieval augmented\ngeneration (RAG), and to assess the effects of model configuration variables on\nextraction performance. Methods and Materials: The study utilized two datasets:\n7,294 radiology reports annotated for Brain Tumor Reporting and Data System\n(BT-RADS) scores and 2,154 pathology reports annotated for isocitrate\ndehydrogenase (IDH) mutation status. An automated pipeline was developed to\nbenchmark the performance of various LMs and RAG configurations. The impact of\nmodel size, quantization, prompting strategies, output formatting, and\ninference parameters was systematically evaluated. Results: The best-performing\nmodels achieved over 98% accuracy in extracting BT-RADS scores from radiology\nreports and over 90% for IDH mutation status extraction from pathology reports.\nThe top model was a medically fine-tuned llama3. Larger, newer, and domain\nfine-tuned models consistently outperformed older and smaller models. 
Model\nquantization had minimal impact on performance. Few-shot prompting\nsignificantly improved accuracy. RAG improved performance for complex pathology\nreports but not for shorter radiology reports. Conclusions: Open LMs\ndemonstrate significant potential for automated extraction of structured\nclinical data from unstructured clinical reports with local privacy-preserving\napplications. Careful model selection, prompt engineering, and semi-automated\noptimization using annotated data are critical for optimal performance. These\napproaches could be reliable enough for practical use in research workflows,\nhighlighting the potential for human-machine collaboration in healthcare data\nextraction.\n","authors":["Mohamed Sobhi Jabal","Pranav Warman","Jikai Zhang","Kartikeye Gupta","Ayush Jain","Maciej Mazurowski","Walter Wiggins","Kirti Magudia","Evan Calabrese"],"pdf_url":"https://arxiv.org/pdf/2409.10576v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11905v1","updated":"2024-09-18T12:05:30Z","published":"2024-09-18T12:05:30Z","title":"AlignBot: Aligning VLM-powered Customized Task Planning with User\n Reminders Through Fine-Tuning for Household Robots","summary":" This paper presents AlignBot, a novel framework designed to optimize\nVLM-powered customized task planning for household robots by effectively\naligning with user reminders. In domestic settings, aligning task planning with\nuser reminders poses significant challenges due to the limited quantity,\ndiversity, and multimodal nature of the reminders. To address these challenges,\nAlignBot employs a fine-tuned LLaVA-7B model, functioning as an adapter for\nGPT-4o. This adapter model internalizes diverse forms of user reminders, such\nas personalized preferences, corrective guidance, and contextual assistance,\ninto structured instruction-formatted cues that prompt GPT-4o in generating\ncustomized task plans. Additionally, AlignBot integrates a dynamic retrieval\nmechanism that selects task-relevant historical successes as prompts for\nGPT-4o, further enhancing task planning accuracy. To validate the effectiveness\nof AlignBot, experiments are conducted in real-world household environments,\nwhich are constructed within the laboratory to replicate typical household\nsettings. A multimodal dataset with over 1,500 entries derived from volunteer\nreminders is used for training and evaluation. The results demonstrate that\nAlignBot significantly improves customized task planning, outperforming\nexisting LLM- and VLM-powered planners by interpreting and aligning with user\nreminders, achieving an 86.8% success rate compared to the vanilla GPT-4o\nbaseline at 21.6%, reflecting a 65% improvement and over four times greater\neffectiveness. Supplementary materials are available at:\nhttps://yding25.com/AlignBot/\n","authors":[" Zhaxizhuoma","Pengan Chen","Ziniu Wu","Jiawei Sun","Dong Wang","Peng Zhou","Nieqing Cao","Yan Ding","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2409.11905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11860v1","updated":"2024-09-18T10:30:50Z","published":"2024-09-18T10:30:50Z","title":"Retrieve, Annotate, Evaluate, Repeat: Leveraging Multimodal LLMs for\n Large-Scale Product Retrieval Evaluation","summary":" Evaluating production-level retrieval systems at scale is a crucial yet\nchallenging task due to the limited availability of a large pool of\nwell-trained human annotators. 
Large Language Models (LLMs) have the potential\nto address this scaling issue and offer a viable alternative to humans for the\nbulk of annotation tasks. In this paper, we propose a framework for assessing\nproduct search engines in a large-scale e-commerce setting, leveraging\nMultimodal LLMs for (i) generating tailored annotation guidelines for\nindividual queries, and (ii) conducting the subsequent annotation task. Our\nmethod, validated through deployment on a large e-commerce platform,\ndemonstrates comparable quality to human annotations, significantly reduces\ntime and cost, facilitates rapid problem discovery, and provides an effective\nsolution for production-level quality control at scale.\n","authors":["Kasra Hosseini","Thomas Kober","Josip Krapac","Roland Vollgraf","Weiwei Cheng","Ana Peleteiro Ramallo"],"pdf_url":"https://arxiv.org/pdf/2409.11860v1.pdf","comment":"13 pages, 5 figures, 4 Tables"},{"id":"http://arxiv.org/abs/2212.08841v3","updated":"2024-09-18T09:09:07Z","published":"2022-12-17T10:43:25Z","title":"AugTriever: Unsupervised Dense Retrieval by Scalable Data Augmentation","summary":" Dense retrievers have made significant strides in text retrieval and\nopen-domain question answering. However, most of these achievements have relied\nheavily on extensive human-annotated supervision. In this study, we aim to\ndevelop unsupervised methods for improving dense retrieval models. We propose\ntwo approaches that enable annotation-free and scalable training by creating\npseudo query-document pairs: query extraction and transferred query generation.\nThe query extraction method involves selecting salient spans from the original\ndocument to generate pseudo queries. On the other hand, the transferred query\ngeneration method utilizes generation models trained for other NLP tasks, such\nas summarization, to produce pseudo queries. Through extensive experimentation,\nwe demonstrate that models trained using these augmentation methods can achieve\nperformance comparable to, if not better than, multiple strong dense baselines.\nMoreover, combining these strategies leads to further improvements, resulting\nin superior performance of unsupervised dense retrieval, unsupervised domain\nadaptation and supervised finetuning, benchmarked on both BEIR and ODQA\ndatasets. Code and datasets are publicly available at\nhttps://github.com/salesforce/AugTriever.\n","authors":["Rui Meng","Ye Liu","Semih Yavuz","Divyansh Agarwal","Lifu Tu","Ning Yu","Jianguo Zhang","Meghana Bhat","Yingbo Zhou"],"pdf_url":"https://arxiv.org/pdf/2212.08841v3.pdf","comment":"DCAI24, October 25, 2024, Boise, ID"},{"id":"http://arxiv.org/abs/2409.11798v1","updated":"2024-09-18T08:30:20Z","published":"2024-09-18T08:30:20Z","title":"The Factuality of Large Language Models in the Legal Domain","summary":" This paper investigates the factuality of large language models (LLMs) as\nknowledge bases in the legal domain, in a realistic usage scenario: we allow\nfor acceptable variations in the answer, and let the model abstain from\nanswering when uncertain. First, we design a dataset of diverse factual\nquestions about case law and legislation. We then use the dataset to evaluate\nseveral LLMs under different evaluation methods, including exact, alias, and\nfuzzy matching. Our results show that the performance improves significantly\nunder the alias and fuzzy matching methods. Further, we explore the impact of\nabstaining and in-context examples, finding that both strategies enhance\nprecision. 
Finally, we demonstrate that additional pre-training on legal\ndocuments, as seen with SaulLM, further improves factual precision from 63% to\n81%.\n","authors":["Rajaa El Hamdani","Thomas Bonald","Fragkiskos Malliaros","Nils Holzenberger","Fabian Suchanek"],"pdf_url":"https://arxiv.org/pdf/2409.11798v1.pdf","comment":"CIKM 2024, short paper"},{"id":"http://arxiv.org/abs/2409.11728v1","updated":"2024-09-18T06:33:11Z","published":"2024-09-18T06:33:11Z","title":"Active Reconfigurable Intelligent Surface Empowered Synthetic Aperture\n Radar Imaging","summary":" Synthetic Aperture Radar (SAR) utilizes the movement of the radar antenna\nover a specific area of interest to achieve higher spatial resolution imaging.\nIn this paper, we aim to investigate the realization of SAR imaging for a\nstationary radar system with the assistance of an active reconfigurable\nintelligent surface (ARIS) mounted on an unmanned aerial vehicle (UAV). As the\nUAV moves along the stationary trajectory, the ARIS can not only build a\nhigh-quality virtual line-of-sight (LoS) propagation path, but its mobility can\nalso effectively create a much larger virtual aperture, which can be utilized\nto realize a SAR system. In this paper, we first present a range-Doppler (RD)\nimaging algorithm to obtain imaging results for the proposed ARIS-empowered SAR\nsystem. Then, to further improve the SAR imaging performance, we attempt to\noptimize the reflection coefficients of ARIS to maximize the signal-to-noise\nratio (SNR) at the stationary radar receiver under the constraints of ARIS\nmaximum power and amplification factor. An effective algorithm based on\nfractional programming (FP) and majorization minimization (MM) methods is\ndeveloped to solve the resulting non-convex problem. Simulation results\nvalidate the effectiveness of ARIS-assisted SAR imaging and our proposed RD\nimaging and ARIS optimization algorithms.\n","authors":["Yifan Sun","Rang Liu","Zhiping Lu","Honghao Luo","Ming Li","Qian Liu"],"pdf_url":"https://arxiv.org/pdf/2409.11728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11699v1","updated":"2024-09-18T04:43:41Z","published":"2024-09-18T04:43:41Z","title":"FLARE: Fusing Language Models and Collaborative Architectures for\n Recommender Enhancement","summary":" Hybrid recommender systems, combining item IDs and textual descriptions,\noffer potential for improved accuracy. However, previous work has largely\nfocused on smaller datasets and model architectures. This paper introduces\nFlare (Fusing Language models and collaborative Architectures for Recommender\nEnhancement), a novel hybrid recommender that integrates a language model (mT5)\nwith a collaborative filtering model (Bert4Rec) using a Perceiver network. This\narchitecture allows Flare to effectively combine collaborative and content\ninformation for enhanced recommendations.\n We conduct a two-stage evaluation, first assessing Flare's performance\nagainst established baselines on smaller datasets, where it demonstrates\ncompetitive accuracy. Subsequently, we evaluate Flare on a larger, more\nrealistic dataset with a significantly larger item vocabulary, introducing new\nbaselines for this setting. Finally, we showcase Flare's inherent ability to\nsupport critiquing, enabling users to provide feedback and refine\nrecommendations. 
We further leverage critiquing as an evaluation method to\nassess the model's language understanding and its transferability to the\nrecommendation task.\n","authors":["Liam Hebert","Marialena Kyriakidi","Hubert Pham","Krishna Sayana","James Pine","Sukhdeep Sodhi","Ambarish Jash"],"pdf_url":"https://arxiv.org/pdf/2409.11699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11695v1","updated":"2024-09-18T04:31:22Z","published":"2024-09-18T04:31:22Z","title":"Basket-Enhanced Heterogenous Hypergraph for Price-Sensitive Next Basket\n Recommendation","summary":" Next Basket Recommendation (NBR) is a new type of recommender system that\npredicts combinations of items users are likely to purchase together. Existing\nNBR models often overlook a crucial factor, price, and do not fully\ncapture item-basket-user interactions. To address these limitations, we propose\na novel method called Basket-augmented Dynamic Heterogeneous Hypergraph (BDHH).\nBDHH utilizes a heterogeneous multi-relational graph to capture the intricate\nrelationships among item features, with price as a critical factor. Moreover,\nour approach includes a basket-guided dynamic augmentation network that\ndynamically enhances item-basket-user interactions. Experiments on real-world\ndatasets demonstrate that BDHH significantly improves recommendation accuracy,\nproviding a more comprehensive understanding of user behavior.\n","authors":["Yuening Zhou","Yulin Wang","Qian Cui","Xinyu Guan","Francisco Cisternas"],"pdf_url":"https://arxiv.org/pdf/2409.11695v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10992v2","updated":"2024-09-18T04:08:44Z","published":"2024-09-17T08:51:02Z","title":"A Best-of-Both Approach to Improve Match Predictions and Reciprocal\n Recommendations for Job Search","summary":" Matching users with mutual preferences is a critical aspect of services\ndriven by reciprocal recommendations, such as job search. To produce\nrecommendations in such scenarios, one can predict match probabilities and\nconstruct rankings based on these predictions. However, this direct match\nprediction approach often underperforms due to the extreme sparsity of match\nlabels. Therefore, most existing methods predict preferences separately for\neach direction (e.g., job seeker to employer and employer to job seeker) and\nthen aggregate the predictions to generate overall matching scores and produce\nrecommendations. However, this typical approach often leads to practical\nissues, such as biased error propagation between the two models. This paper\nintroduces and demonstrates a novel and practical solution to improve\nreciprocal recommendations in production by leveraging pseudo-match scores.\nSpecifically, our approach generates dense and more directly relevant\npseudo-match scores by combining the true match labels, which are accurate but\nsparse, with relatively inaccurate but dense match predictions. We then train a\nmeta-model to output the final match predictions by minimizing the prediction\nloss against the pseudo-match scores. Our method can be seen as a best-of-both\n(BoB) approach, as it combines the high-level ideas of both direct match\nprediction and the two separate models approach. 
It also allows for\nuser-specific weights to construct personalized pseudo-match scores, achieving\neven better matching performance through appropriate tuning of the weights.\nOffline experiments on real-world job search data demonstrate the superior\nperformance of our BoB method, particularly with personalized pseudo-match\nscores, compared to existing approaches in terms of finding potential matches.\n","authors":["Shuhei Goda","Yudai Hayashi","Yuta Saito"],"pdf_url":"https://arxiv.org/pdf/2409.10992v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11678v1","updated":"2024-09-18T03:34:31Z","published":"2024-09-18T03:34:31Z","title":"An Enhanced-State Reinforcement Learning Algorithm for Multi-Task Fusion\n in Large-Scale Recommender Systems","summary":" As the last key stage of Recommender Systems (RSs), Multi-Task Fusion (MTF)\nis in charge of combining multiple scores predicted by Multi-Task Learning\n(MTL) into a final score to maximize user satisfaction, which decides the\nultimate recommendation results. In recent years, to maximize long-term user\nsatisfaction within a recommendation session, Reinforcement Learning (RL) is\nwidely used for MTF in large-scale RSs. However, limited by their modeling\npattern, all the current RL-MTF methods can only utilize user features as the\nstate to generate actions for each user, but are unable to make use of item\nfeatures and other valuable features, which leads to suboptimal results.\nAddressing this problem is a challenge that requires breaking through the\ncurrent modeling pattern of RL-MTF. To solve this problem, we propose a novel\nmethod called Enhanced-State RL for MTF in RSs. Unlike the existing methods\nmentioned above, our method first defines user features, item features, and\nother valuable features collectively as the enhanced state; then proposes a\nnovel actor and critic learning process to utilize the enhanced state to take\nmuch better actions for each user-item pair. To the best of our knowledge, this\nnovel modeling pattern is being proposed for the first time in the field of\nRL-MTF. We conduct extensive offline and online experiments in a large-scale\nRS. The results demonstrate that our model outperforms other models\nsignificantly. Enhanced-State RL has been fully deployed in our RS for more\nthan half a year, improving +3.84% user valid consumption and +0.58% user\nduration time compared to the baseline.\n","authors":["Peng Liu","Jiawei Zhu","Cong Xu","Ming Zhao","Bin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.11678v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2404.17589"},{"id":"http://arxiv.org/abs/2409.11629v1","updated":"2024-09-18T01:23:26Z","published":"2024-09-18T01:23:26Z","title":"Designing Interfaces for Multimodal Vector Search Applications","summary":" Multimodal vector search offers a new paradigm for information retrieval by\nexposing numerous pieces of functionality which are not possible in traditional\nlexical search engines. While multimodal vector search can be treated as a\ndrop-in replacement for these traditional systems, the experience can be\nsignificantly enhanced by leveraging the unique capabilities of multimodal\nsearch. Central to any information retrieval system is a user who expresses an\ninformation need. Traditional user interfaces with a single search bar allow\nusers to interact with lexical search systems effectively; however, they are\nnot necessarily optimal for multimodal vector search. 
In this paper, we explore\nnovel capabilities of multimodal vector search applications utilising CLIP\nmodels and present implementations and design patterns which better allow users\nto express their information needs and effectively interact with these systems\nin an information retrieval context.\n","authors":["Owen Pendrigh Elliott","Tom Hamer","Jesse Clark"],"pdf_url":"https://arxiv.org/pdf/2409.11629v1.pdf","comment":"12 pages, 8 figures, CIKM 2024 MMSR Workshop"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2409.12192v1","updated":"2024-09-18T17:59:43Z","published":"2024-09-18T17:59:43Z","title":"DynaMo: In-Domain Dynamics Pretraining for Visuo-Motor Control","summary":" Imitation learning has proven to be a powerful tool for training complex\nvisuomotor policies. However, current methods often require hundreds to\nthousands of expert demonstrations to handle high-dimensional visual\nobservations. A key reason for this poor data efficiency is that visual\nrepresentations are predominantly either pretrained on out-of-domain data or\ntrained directly through a behavior cloning objective. In this work, we present\nDynaMo, a new in-domain, self-supervised method for learning visual\nrepresentations. Given a set of expert demonstrations, we jointly learn a\nlatent inverse dynamics model and a forward dynamics model over a sequence of\nimage embeddings, predicting the next frame in latent space, without\naugmentations, contrastive sampling, or access to ground truth actions.\nImportantly, DynaMo does not require any out-of-domain data such as Internet\ndatasets or cross-embodied datasets. On a suite of six simulated and real\nenvironments, we show that representations learned with DynaMo significantly\nimprove downstream imitation learning performance over prior self-supervised\nlearning objectives and pretrained representations. Gains from using DynaMo\nhold across policy classes such as Behavior Transformer, Diffusion Policy, MLP,\nand nearest neighbors. Finally, we ablate over key components of DynaMo and\nmeasure its impact on downstream policy performance. Robot videos are best\nviewed at https://dynamo-ssl.github.io\n","authors":["Zichen Jeff Cui","Hengkai Pan","Aadhithya Iyer","Siddhant Haldar","Lerrel Pinto"],"pdf_url":"https://arxiv.org/pdf/2409.12192v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12189v1","updated":"2024-09-18T17:58:51Z","published":"2024-09-18T17:58:51Z","title":"Massively Multi-Person 3D Human Motion Forecasting with Scene Context","summary":" Forecasting long-term 3D human motion is challenging: the stochasticity of\nhuman behavior makes it hard to generate realistic human motion from the input\nsequence alone. Information on the scene environment and the motion of nearby\npeople can greatly aid the generation process. We propose a scene-aware social\ntransformer model (SAST) to forecast long-term (10s) human motion.\nUnlike previous models, our approach can model interactions between both widely\nvarying numbers of people and objects in a scene. We combine a temporal\nconvolutional encoder-decoder architecture with a Transformer-based bottleneck\nthat allows us to efficiently combine motion and scene information. We model\nthe conditional motion distribution using denoising diffusion models. We\nbenchmark our approach on the Humans in Kitchens dataset, which contains 1 to\n16 persons and 29 to 50 objects that are visible simultaneously. 
Our model\noutperforms other approaches in terms of realism and diversity on different\nmetrics and in a user study. Code is available at\nhttps://github.com/felixbmuller/SAST.\n","authors":["Felix B Mueller","Julian Tanke","Juergen Gall"],"pdf_url":"https://arxiv.org/pdf/2409.12189v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.12183v1","updated":"2024-09-18T17:55:00Z","published":"2024-09-18T17:55:00Z","title":"To CoT or not to CoT? Chain-of-thought helps mainly on math and symbolic\n reasoning","summary":" Chain-of-thought (CoT) via prompting is the de facto method for eliciting\nreasoning capabilities from large language models (LLMs). But for what kinds of\ntasks is this extra ``thinking'' really helpful? To analyze this, we conducted\na quantitative meta-analysis covering over 100 papers using CoT and ran our own\nevaluations of 20 datasets across 14 models. Our results show that CoT gives\nstrong performance benefits primarily on tasks involving math or logic, with\nmuch smaller gains on other types of tasks. On MMLU, directly generating the\nanswer without CoT leads to almost identical accuracy as CoT unless the\nquestion or model's response contains an equals sign, indicating symbolic\noperations and reasoning. Following this finding, we analyze the behavior of\nCoT on these problems by separating planning and execution and comparing\nagainst tool-augmented LLMs. Much of CoT's gain comes from improving symbolic\nexecution, but it underperforms relative to using a symbolic solver. Our\nresults indicate that CoT can be applied selectively, maintaining performance\nwhile saving inference costs. Furthermore, they suggest a need to move beyond\nprompt-based CoT to new paradigms that better leverage intermediate computation\nacross the whole range of LLM applications.\n","authors":["Zayne Sprague","Fangcong Yin","Juan Diego Rodriguez","Dongwei Jiang","Manya Wadhwa","Prasann Singhal","Xinyu Zhao","Xi Ye","Kyle Mahowald","Greg Durrett"],"pdf_url":"https://arxiv.org/pdf/2409.12183v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12180v1","updated":"2024-09-18T17:52:53Z","published":"2024-09-18T17:52:53Z","title":"Finetuning Language Models to Emit Linguistic Expressions of Uncertainty","summary":" Large language models (LLMs) are increasingly employed in information-seeking\nand decision-making tasks. Despite their broad utility, LLMs tend to generate\ninformation that conflicts with real-world facts, and their persuasive style\ncan make these inaccuracies appear confident and convincing. As a result,\nend-users struggle to consistently align the confidence expressed by LLMs with\nthe accuracy of their predictions, often leading to either blind trust in all\noutputs or a complete disregard for their reliability. In this work, we explore\nsupervised finetuning on uncertainty-augmented predictions as a method to\ndevelop models that produce linguistic expressions of uncertainty.\nSpecifically, we measure the calibration of pre-trained models and then\nfine-tune language models to generate calibrated linguistic expressions of\nuncertainty. 
Through experiments on various question-answering datasets, we\ndemonstrate that LLMs are well-calibrated in assessing their predictions, and\nsupervised finetuning based on the model's own confidence leads to\nwell-calibrated expressions of uncertainty, particularly for single-claim\nanswers.\n","authors":["Arslan Chaudhry","Sridhar Thiagarajan","Dilan Gorur"],"pdf_url":"https://arxiv.org/pdf/2409.12180v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.11200v2","updated":"2024-09-18T17:46:11Z","published":"2024-08-20T21:20:38Z","title":"UKAN: Unbound Kolmogorov-Arnold Network Accompanied with Accelerated\n Library","summary":" In this work, we present a GPU-accelerated library for the underlying\ncomponents of Kolmogorov-Arnold Networks (KANs), along with an algorithm to\neliminate bounded grids in KANs. The GPU-accelerated library reduces the\ncomputational complexity of Basis Spline (B-spline) evaluation by a factor of\n$\\mathcal{O}$(grid size) compared to existing codes, enabling batch computation\nfor large-scale learning. To overcome the limitations of traditional KANs, we\nintroduce Unbounded KANs (UKANs), which eliminate the need for a bounded grid\nand a fixed number of B-spline coefficients. To do so, we replace the KAN\nparameters (B-spline coefficients) with a coefficient generator (CG) model. The\ninputs to the CG model are designed based on the idea of an infinite symmetric\ngrid extending from negative infinity to positive infinity. The positional\nencoding of a grid group, a sequential collection of B-spline grid indexes, is\nfed into the CG model, and coefficients are consumed by the efficient\nimplementation (matrix representations) of B-spline functions to generate\noutputs. We perform several experiments on regression, classification, and\ngenerative tasks, with promising results. In particular, UKAN does not require\ndata normalization or a bounded domain for evaluation. Additionally, our\nbenchmarking results indicate the superior memory and computational efficiency\nof our library compared to existing codes.\n","authors":["Alireza Moradzadeh","Lukasz Wawrzyniak","Miles Macklin","Saee G. Paliwal"],"pdf_url":"https://arxiv.org/pdf/2408.11200v2.pdf","comment":"10 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2312.16810v3","updated":"2024-09-18T17:45:20Z","published":"2023-12-28T04:00:25Z","title":"Machine Learning Approaches for Diagnostics and Prognostics of\n Industrial Systems Using Open Source Data from PHM Data Challenges: A Review","summary":" In the field of Prognostics and Health Management (PHM), recent years have\nwitnessed a significant surge in the application of machine learning (ML).\nDespite this growth, the field grapples with a lack of unified guidelines and\nsystematic approaches for effectively implementing these ML techniques and\ncomprehensive analysis regarding industrial open-source data across varied\nscenarios. To address these gaps, this paper provides a comprehensive review of\nML approaches for diagnostics and prognostics of industrial systems using\nopen-source datasets from PHM Data Challenge Competitions held between 2018 and\n2023 by PHM Society and IEEE Reliability Society and summarizes a unified ML\nframework. 
This review systematically categorizes and scrutinizes the problems,\nchallenges, methodologies, and advancements demonstrated in these competitions,\nhighlighting the evolving role of both conventional machine learning and deep\nlearning in tackling complex industrial tasks related to detection, diagnosis,\nassessment, and prognosis. Moreover, this paper delves into the common\nchallenges in PHM data challenge competitions by emphasizing data-related and\nmodel-related issues and evaluating the limitations of these competitions. The\npotential solutions to address these challenges are also summarized. Finally,\nwe identify key themes and potential directions for future research, providing\nopportunities and prospects for next-generation ML-PHM development in the PHM\ndomain.\n","authors":["Hanqi Su","Jay Lee"],"pdf_url":"https://arxiv.org/pdf/2312.16810v3.pdf","comment":"The paper submitted to the International Journal of Prognostics and\n Health Management (IJPHM) has been accepted"},{"id":"http://arxiv.org/abs/2407.20209v2","updated":"2024-09-18T17:44:48Z","published":"2024-07-29T17:40:04Z","title":"Characterizing Dynamical Stability of Stochastic Gradient Descent in\n Overparameterized Learning","summary":" For overparameterized optimization tasks, such as the ones found in modern\nmachine learning, global minima are generally not unique. In order to\nunderstand generalization in these settings, it is vital to study to which\nminimum an optimization algorithm converges. The possibility of having minima\nthat are unstable under the dynamics imposed by the optimization algorithm\nlimits the potential minima that the algorithm can find. In this paper, we\ncharacterize the global minima that are dynamically stable/unstable for both\ndeterministic and stochastic gradient descent (SGD). In particular, we\nintroduce a characteristic Lyapunov exponent which depends on the local\ndynamics around a global minimum and rigorously prove that the sign of this\nLyapunov exponent determines whether SGD can accumulate at the respective\nglobal minimum.\n","authors":["Dennis Chemnitz","Maximilian Engel"],"pdf_url":"https://arxiv.org/pdf/2407.20209v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10289v2","updated":"2024-09-18T17:30:50Z","published":"2024-09-16T13:56:17Z","title":"ReflectDiffu:Reflect between Emotion-intent Contagion and Mimicry for\n Empathetic Response Generation via a RL-Diffusion Framework","summary":" Empathetic response generation necessitates the integration of emotional and\nintentional dynamics to foster meaningful interactions. Existing research\neither neglects the intricate interplay between emotion and intent, leading to\nsuboptimal controllability of empathy, or resorts to large language models\n(LLMs), which incur significant computational overhead. In this paper, we\nintroduce ReflectDiffu, a lightweight and comprehensive framework for\nempathetic response generation. This framework incorporates emotion contagion\nto augment emotional expressiveness and employs an emotion-reasoning mask to\npinpoint critical emotional elements. Additionally, it integrates intent\nmimicry within reinforcement learning for refinement during diffusion. By\nharnessing an intent twice-reflect mechanism of\nExploring-Sampling-Correcting, ReflectDiffu adeptly translates emotional\ndecision-making into precise intent actions, thereby addressing empathetic\nresponse misalignments stemming from emotional misrecognition. 
Through\nreflection, the framework maps emotional states to intents, markedly enhancing\nboth response empathy and flexibility. Comprehensive experiments reveal that\nReflectDiffu outperforms existing models regarding relevance, controllability,\nand informativeness, achieving state-of-the-art results in both automatic and\nhuman evaluations.\n","authors":["Jiahao Yuan","Zixiang Di","Zhiqing Cui","Guisong Yang","Usman Naseem"],"pdf_url":"https://arxiv.org/pdf/2409.10289v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.02762v2","updated":"2024-09-18T17:28:24Z","published":"2024-05-04T21:55:33Z","title":"TK-Planes: Tiered K-Planes with High Dimensional Feature Vectors for\n Dynamic UAV-based Scenes","summary":" In this paper, we present a new approach to bridge the domain gap between\nsynthetic and real-world data for unmanned aerial vehicle (UAV)-based\nperception. Our formulation is designed for dynamic scenes, consisting of small\nmoving objects or human actions. We propose an extension of K-Planes Neural\nRadiance Field (NeRF), wherein our algorithm stores a set of tiered feature\nvectors. The tiered feature vectors are generated to effectively model\nconceptual information about a scene as well as an image decoder that\ntransforms output feature maps into RGB images. Our technique leverages the\ninformation amongst both static and dynamic objects within a scene and is able\nto capture salient scene attributes of high altitude videos. We evaluate its\nperformance on challenging datasets, including Okutama Action and UG2, and\nobserve considerable improvement in accuracy over state-of-the-art neural\nrendering methods.\n","authors":["Christopher Maxey","Jaehoon Choi","Yonghan Lee","Hyungtae Lee","Dinesh Manocha","Heesung Kwon"],"pdf_url":"https://arxiv.org/pdf/2405.02762v2.pdf","comment":"8 pages, submitted to ICRA2025"},{"id":"http://arxiv.org/abs/2409.12150v1","updated":"2024-09-18T17:15:06Z","published":"2024-09-18T17:15:06Z","title":"Decoding Style: Efficient Fine-Tuning of LLMs for Image-Guided Outfit\n Recommendation with Preference","summary":" Personalized outfit recommendation remains a complex challenge, demanding\nboth fashion compatibility understanding and trend awareness. This paper\npresents a novel framework that harnesses the expressive power of large\nlanguage models (LLMs) for this task, mitigating their \"black box\" and static\nnature through fine-tuning and direct feedback integration. We bridge the\nvisual-textual gap in item descriptions by employing image captioning with a\nMultimodal Large Language Model (MLLM). This enables the LLM to extract style\nand color characteristics from human-curated fashion images, forming the basis\nfor personalized recommendations. The LLM is efficiently fine-tuned on the\nopen-source Polyvore dataset of curated fashion images, optimizing its ability\nto recommend stylish outfits. A direct preference mechanism using negative\nexamples is employed to enhance the LLM's decision-making process. This creates\na self-enhancing AI feedback loop that continuously refines recommendations in\nline with seasonal fashion trends. Our framework is evaluated on the Polyvore\ndataset, demonstrating its effectiveness in two key tasks: fill-in-the-blank\nand complementary item retrieval. These evaluations underline the framework's\nability to generate stylish, trend-aligned outfit suggestions, continuously\nimproving through direct feedback. 
The evaluation results demonstrate that our\nproposed framework significantly outperforms the base LLM, creating more\ncohesive outfits. The improved performance in these tasks underscores the\nproposed framework's potential to enhance the shopping experience with accurate\nsuggestions, proving its effectiveness over vanilla LLM-based outfit\ngeneration.\n","authors":["Najmeh Forouzandehmehr","Nima Farrokhsiar","Ramin Giahi","Evren Korpeoglu","Kannan Achan"],"pdf_url":"https://arxiv.org/pdf/2409.12150v1.pdf","comment":"CIKM 2024"},{"id":"http://arxiv.org/abs/2409.12136v1","updated":"2024-09-18T17:00:20Z","published":"2024-09-18T17:00:20Z","title":"GRIN: GRadient-INformed MoE","summary":" Mixture-of-Experts (MoE) models scale more effectively than dense models due\nto sparse computation through expert routing, selectively activating only a\nsmall subset of expert modules. However, sparse computation challenges\ntraditional training practices, as discrete expert routing hinders standard\nbackpropagation and thus gradient-based optimization, which are the cornerstone\nof deep learning. To better pursue the scaling power of MoE, we introduce GRIN\n(GRadient-INformed MoE training), which incorporates sparse gradient estimation\nfor expert routing and configures model parallelism to avoid token dropping.\nApplying GRIN to autoregressive language modeling, we develop a top-2\n16$\\times$3.8B MoE model. Our model, with only 6.6B activated parameters,\noutperforms a 7B dense model and matches the performance of a 14B dense model\ntrained on the same data. Extensive evaluations across diverse tasks\ndemonstrate the potential of GRIN to significantly enhance MoE efficacy,\nachieving 79.4 on MMLU, 83.7 on HellaSwag, 74.4 on HumanEval, and 58.9 on MATH.\n","authors":["Liyuan Liu","Young Jin Kim","Shuohang Wang","Chen Liang","Yelong Shen","Hao Cheng","Xiaodong Liu","Masahiro Tanaka","Xiaoxia Wu","Wenxiang Hu","Vishrav Chaudhary","Zeqi Lin","Chenruidong Zhang","Jilong Xue","Hany Awadalla","Jianfeng Gao","Weizhu Chen"],"pdf_url":"https://arxiv.org/pdf/2409.12136v1.pdf","comment":"58 pages"},{"id":"http://arxiv.org/abs/2409.12135v1","updated":"2024-09-18T16:59:17Z","published":"2024-09-18T16:59:17Z","title":"Almost Sure Convergence of Linear Temporal Difference Learning with\n Arbitrary Features","summary":" Temporal difference (TD) learning with linear function approximation,\nabbreviated as linear TD, is a classic and powerful prediction algorithm in\nreinforcement learning. While it is well understood that linear TD converges\nalmost surely to a unique point, this convergence traditionally requires the\nassumption that the features used by the approximator are linearly independent.\nHowever, this linear independence assumption does not hold in many practical\nscenarios. This work is the first to establish the almost sure convergence of\nlinear TD without requiring linearly independent features. In fact, we do not\nmake any assumptions on the features. We prove that the approximated value\nfunction converges to a unique point and the weight iterates converge to a set.\nWe also establish a notion of local stability of the weight iterates.\nImportantly, we do not need to introduce any additional assumptions and\ndo not need to make any modification to the linear TD algorithm. 
Key to our\nanalysis is a novel characterization of bounded invariant sets of the mean ODE\nof linear TD.\n","authors":["Jiuqi Wang","Shangtong Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.12135v1.pdf","comment":"30 pages, 0 figures"},{"id":"http://arxiv.org/abs/2409.12122v1","updated":"2024-09-18T16:45:37Z","published":"2024-09-18T16:45:37Z","title":"Qwen2.5-Math Technical Report: Toward Mathematical Expert Model via\n Self-Improvement","summary":" In this report, we present a series of math-specific large language models:\nQwen2.5-Math and Qwen2.5-Math-Instruct-1.5B/7B/72B. The core innovation of the\nQwen2.5 series lies in integrating the philosophy of self-improvement\nthroughout the entire pipeline, from pre-training and post-training to\ninference: (1) During the pre-training phase, Qwen2-Math-Instruct is utilized\nto generate large-scale, high-quality mathematical data. (2) In the\npost-training phase, we develop a reward model (RM) by conducting massive\nsampling from Qwen2-Math-Instruct. This RM is then applied to the iterative\nevolution of data in supervised fine-tuning (SFT). With a stronger SFT model,\nit is possible to iteratively train and update the RM, which in turn guides the\nnext round of SFT data iteration. On the final SFT model, we employ the\nultimate RM for reinforcement learning, resulting in the Qwen2.5-Math-Instruct.\n(3) Furthermore, during the inference stage, the RM is used to guide sampling,\noptimizing the model's performance.\n Qwen2.5-Math-Instruct supports both Chinese and English, and possesses advanced\nmathematical reasoning capabilities, including Chain-of-Thought (CoT) and\nTool-Integrated Reasoning (TIR). We evaluate our models on 10 mathematics\ndatasets in both English and Chinese, such as GSM8K, MATH, GaoKao, AMC23, and\nAIME24, covering a range of difficulties from grade school level to math\ncompetition problems.\n","authors":["An Yang","Beichen Zhang","Binyuan Hui","Bofei Gao","Bowen Yu","Chengpeng Li","Dayiheng Liu","Jianhong Tu","Jingren Zhou","Junyang Lin","Keming Lu","Mingfeng Xue","Runji Lin","Tianyu Liu","Xingzhang Ren","Zhenru Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.12122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12116v1","updated":"2024-09-18T16:38:37Z","published":"2024-09-18T16:38:37Z","title":"Stronger Baseline Models -- A Key Requirement for Aligning Machine\n Learning Research with Clinical Utility","summary":" Machine Learning (ML) research has increased substantially in recent years,\ndue to the success of predictive modeling across diverse application domains.\nHowever, well-known barriers exist when attempting to deploy ML models in\nhigh-stakes, clinical settings, including lack of model transparency (or the\ninability to audit the inference process), large training data requirements\nwith siloed data sources, and complicated metrics for measuring model utility.\nIn this work, we show empirically that including stronger baseline models in\nhealthcare ML evaluations has important downstream effects that aid\npractitioners in addressing these challenges. Through a series of case studies,\nwe find that the common practice of omitting baselines or comparing against a\nweak baseline model (e.g. a linear model with no optimization) obscures the\nvalue of ML methods proposed in the research literature. 
Using these insights,\nwe propose some best practices that will enable practitioners to more\neffectively study and deploy ML models in clinical settings.\n","authors":["Nathan Wolfrath","Joel Wolfrath","Hengrui Hu","Anjishnu Banerjee","Anai N. Kothari"],"pdf_url":"https://arxiv.org/pdf/2409.12116v1.pdf","comment":"18 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.12112v1","updated":"2024-09-18T16:31:19Z","published":"2024-09-18T16:31:19Z","title":"Pareto Data Framework: Steps Towards Resource-Efficient Decision Making\n Using Minimum Viable Data (MVD)","summary":" This paper introduces the Pareto Data Framework, an approach for identifying\nand selecting the Minimum Viable Data (MVD) required for enabling machine\nlearning applications on constrained platforms such as embedded systems, mobile\ndevices, and Internet of Things (IoT) devices. We demonstrate that strategic\ndata reduction can maintain high performance while significantly reducing\nbandwidth, energy, computation, and storage costs. The framework identifies\nMinimum Viable Data (MVD) to optimize efficiency across resource-constrained\nenvironments without sacrificing performance. It addresses common inefficient\npractices in IoT applications, such as overprovisioning of sensors,\noverprecision, and oversampling of signals, proposing scalable solutions for\noptimal sensor selection, signal extraction and transmission, and data\nrepresentation. An experimental methodology demonstrates effective acoustic\ndata characterization after downsampling, quantization, and truncation to\nsimulate reduced-fidelity sensors and network and storage constraints; results\nshow that performance can be maintained at up to 95\% with sample rates reduced\nby 75\% and bit depths and clip length reduced by 50\%, which translates into\nsubstantial cost and resource reductions. These findings have implications for\nthe design and development of constrained systems. The paper also discusses\nbroader implications of the framework, including the potential to democratize\nadvanced AI technologies across IoT applications and sectors such as\nagriculture, transportation, and manufacturing to improve access and multiply\nthe benefits of data-driven insights.\n","authors":["Tashfain Ahmed","Josh Siegel"],"pdf_url":"https://arxiv.org/pdf/2409.12112v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.12367v2","updated":"2024-09-18T16:30:21Z","published":"2024-04-18T17:50:15Z","title":"Model-free quantification of completeness, uncertainties, and outliers\n in atomistic machine learning using information theory","summary":" An accurate description of information is relevant for a range of problems in\natomistic machine learning (ML), such as crafting training sets, performing\nuncertainty quantification (UQ), or extracting physical insights from large\ndatasets. However, atomistic ML often relies on unsupervised learning or model\npredictions to analyze information contents from simulation or training data.\nHere, we introduce a theoretical framework that provides a rigorous, model-free\ntool to quantify information contents in atomistic simulations. We demonstrate\nthat the information entropy of a distribution of atom-centered environments\nexplains known heuristics in ML potential developments, from training set sizes\nto dataset optimality. Using this tool, we propose a model-free UQ method that\nreliably predicts epistemic uncertainty and detects out-of-distribution\nsamples, including rare events in systems such as nucleation. 
This method\nprovides a general tool for data-driven atomistic modeling and combines efforts\nin ML, simulations, and physical explainability.\n","authors":["Daniel Schwalbe-Koda","Sebastien Hamel","Babak Sadigh","Fei Zhou","Vincenzo Lordi"],"pdf_url":"https://arxiv.org/pdf/2404.12367v2.pdf","comment":"v2.0"},{"id":"http://arxiv.org/abs/2409.12105v1","updated":"2024-09-18T16:25:29Z","published":"2024-09-18T16:25:29Z","title":"FedLF: Adaptive Logit Adjustment and Feature Optimization in Federated\n Long-Tailed Learning","summary":" Federated learning offers a paradigm for preserving privacy\nin distributed machine learning. However, datasets distributed across each\nclient in the real world are inevitably heterogeneous, and if the datasets can\nbe globally aggregated, they tend to follow a long-tailed distribution, which greatly\naffects the performance of the model. The traditional approach to federated\nlearning primarily addresses the heterogeneity of data among clients, yet it\nfails to address the phenomenon of class-wise bias in global long-tailed data.\nThis results in the trained model focusing on the head classes while neglecting\nthe equally important tail classes. Consequently, it is essential to develop a\nmethodology that considers classes holistically. To address the above problems,\nwe propose a new method, FedLF, which introduces three modifications in the\nlocal training phase: adaptive logit adjustment, continuous class centred\noptimization, and feature decorrelation. We compare against seven state-of-the-art\nmethods under varying degrees of data heterogeneity and long-tailed\ndistribution. Extensive experiments on benchmark datasets CIFAR-10-LT and\nCIFAR-100-LT demonstrate that our approach effectively mitigates the problem of\nmodel performance degradation due to data heterogeneity and long-tailed\ndistribution. Our code is available at https://github.com/18sym/FedLF.\n","authors":["Xiuhua Lu","Peng Li","Xuefeng Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.12105v1.pdf","comment":"Accepted by ACML 2024"},{"id":"http://arxiv.org/abs/2409.12100v1","updated":"2024-09-18T16:20:57Z","published":"2024-09-18T16:20:57Z","title":"Symmetry-Enriched Learning: A Category-Theoretic Framework for Robust\n Machine Learning Models","summary":" This manuscript presents a novel framework that integrates higher-order\nsymmetries and category theory into machine learning. We introduce new\nmathematical constructs, including hyper-symmetry categories and functorial\nrepresentations, to model complex transformations within learning algorithms.\nOur contributions include the design of symmetry-enriched learning models, the\ndevelopment of advanced optimization techniques leveraging categorical\nsymmetries, and the theoretical analysis of their implications for model\nrobustness, generalization, and convergence. 
Through rigorous proofs and\npractical applications, we demonstrate that incorporating higher-dimensional\ncategorical structures enhances both the theoretical foundations and practical\ncapabilities of modern machine learning algorithms, opening new directions for\nresearch and innovation.\n","authors":["Ronald Katende"],"pdf_url":"https://arxiv.org/pdf/2409.12100v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13318v2","updated":"2024-09-18T16:09:49Z","published":"2024-04-20T08:23:46Z","title":"EHRFL: Federated Learning Framework for Institution-Specific Model\n Construction using Electronic Health Records","summary":" The increasing volume of electronic health records (EHRs) across healthcare\ninstitutions presents the opportunity to enhance model accuracy and robustness\nin clinical prediction tasks. Federated learning enables training on data from\nmultiple institutions while preserving patient privacy and complying with\nregulatory constraints. However, most federated learning research focuses on\nconstructing a global model for multiple clients, overlooking the practical\nneed for institution-specific models. In this work, we introduce EHRFL, a\nfederated learning framework using EHRs designed to develop a model tailored to\na single healthcare institution. Our framework addresses two key challenges:\n(1) enabling federated learning across institutions with heterogeneous EHR\nsystems using text-based EHR modeling, and (2) reducing the costs associated\nwith federated learning by selecting suitable participating clients using\naveraged patient embeddings, which enables optimizing the number of\nparticipants without compromising model performance for the institution. Our\nexperimental results on multiple open-source EHR datasets demonstrate the\neffectiveness of EHRFL in addressing the two challenges, establishing it as a\npractical solution for institution-specific model development in federated\nlearning.\n","authors":["Jiyoun Kim","Junu Kim","Kyunghoon Hur","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2404.13318v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12087v1","updated":"2024-09-18T16:03:57Z","published":"2024-09-18T16:03:57Z","title":"Towards Interpretable End-Stage Renal Disease (ESRD) Prediction:\n Utilizing Administrative Claims Data with Explainable AI Techniques","summary":" This study explores the potential of utilizing administrative claims data,\ncombined with advanced machine learning and deep learning techniques, to\npredict the progression of Chronic Kidney Disease (CKD) to End-Stage Renal\nDisease (ESRD). We analyze a comprehensive, 10-year dataset provided by a major\nhealth insurance organization to develop prediction models for multiple\nobservation windows using traditional machine learning methods such as Random\nForest and XGBoost as well as deep learning approaches such as Long Short-Term\nMemory (LSTM) networks. Our findings demonstrate that the LSTM model,\nparticularly with a 24-month observation window, exhibits superior performance\nin predicting ESRD progression, outperforming existing models in the\nliterature. We further apply SHapley Additive exPlanations (SHAP) analysis to\nenhance interpretability, providing insights into the impact of individual\nfeatures on predictions at the individual patient level. 
This study underscores\nthe value of leveraging administrative claims data for CKD management and\npredicting ESRD progression.\n","authors":["Yubo Li","Saba Al-Sayouri","Rema Padman"],"pdf_url":"https://arxiv.org/pdf/2409.12087v1.pdf","comment":"10 pages, 4 figures, AMIA 2024"},{"id":"http://arxiv.org/abs/2409.12078v1","updated":"2024-09-18T15:53:45Z","published":"2024-09-18T15:53:45Z","title":"Denoising diffusion models for high-resolution microscopy image\n restoration","summary":" Advances in microscopy imaging enable researchers to visualize structures at\nthe nanoscale level, thereby unraveling intricate details of biological\norganization. However, challenges such as image noise, photobleaching of\nfluorophores, and low tolerability of biological samples to high light doses\nremain, restricting temporal resolutions and experiment durations. Reduced\nlaser doses enable longer measurements at the cost of lower resolution and\nincreased noise, which hinders accurate downstream analyses. Here we train a\ndenoising diffusion probabilistic model (DDPM) to predict high-resolution\nimages by conditioning the model on low-resolution information. Additionally,\nthe probabilistic aspect of the DDPM allows for repeated generation of images\nthat tend to further increase the signal-to-noise ratio. We show that our model\nachieves performance that is better than or similar to the previously\nbest-performing methods across four highly diverse datasets. Importantly,\nwhile each of the previous methods shows competitive performance on some, but\nnot all, datasets, our method consistently achieves high performance across all\nfour datasets, suggesting high generalizability.\n","authors":["Pamela Osuna-Vargas","Maren H. Wehrheim","Lucas Zinz","Johanna Rahm","Ashwin Balakrishnan","Alexandra Kaminer","Mike Heilemann","Matthias Kaschube"],"pdf_url":"https://arxiv.org/pdf/2409.12078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12076v1","updated":"2024-09-18T15:48:59Z","published":"2024-09-18T15:48:59Z","title":"Unsupervised Domain Adaptation Via Data Pruning","summary":" The removal of carefully-selected examples from training data has recently\nemerged as an effective way of improving the robustness of machine learning\nmodels. However, the best way to select these examples remains an open\nquestion. In this paper, we consider the problem from the perspective of\nunsupervised domain adaptation (UDA). We propose AdaPrune, a method for UDA\nwhereby training examples are removed to attempt to align the training\ndistribution to that of the target data. By adopting the maximum mean\ndiscrepancy (MMD) as the criterion for alignment, the problem can be neatly\nformulated and solved as an integer quadratic program. We evaluate our approach\non a real-world domain shift task of bioacoustic event detection. As a method\nfor UDA, we show that AdaPrune outperforms related techniques, and is\ncomplementary to other UDA algorithms such as CORAL. 
Our analysis of the\nrelationship between the MMD and model accuracy, along with t-SNE plots,\nvalidates the proposed method as a principled and well-founded way of performing\ndata pruning.\n","authors":["Andrea Napoli","Paul White"],"pdf_url":"https://arxiv.org/pdf/2409.12076v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16956v2","updated":"2024-09-18T15:47:10Z","published":"2023-11-28T17:03:56Z","title":"Adaptive Step Sizes for Preconditioned Stochastic Gradient Descent","summary":" This paper proposes a novel approach to adaptive step sizes in stochastic\ngradient descent (SGD) by utilizing quantities that we have identified as\nnumerically traceable -- the Lipschitz constant for gradients and a concept of\nthe local variance in search directions. Our findings yield a nearly\nhyperparameter-free algorithm for stochastic optimization, which has provable\nconvergence properties and exhibits truly problem-adaptive behavior on\nclassical image classification tasks. Our framework is set in a general Hilbert\nspace and thus enables the potential inclusion of a preconditioner through the\nchoice of the inner product.\n","authors":["Frederik Köhne","Leonie Kreis","Anton Schiela","Roland Herzog"],"pdf_url":"https://arxiv.org/pdf/2311.16956v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12067v1","updated":"2024-09-18T15:39:12Z","published":"2024-09-18T15:39:12Z","title":"Fitting Multilevel Factor Models","summary":" We examine a special case of the multilevel factor model, with covariance\ngiven by a multilevel low rank (MLR) matrix \cite{parshakova2023factor}. We\ndevelop a novel, fast implementation of the expectation-maximization (EM)\nalgorithm, tailored for multilevel factor models, to maximize the likelihood of\nthe observed data. This method accommodates any hierarchical structure and\nmaintains linear time and storage complexities per iteration. This is achieved\nthrough a new efficient technique for computing the inverse of the positive\ndefinite MLR matrix. We show that the inverse of an invertible PSD MLR matrix\nis also an MLR matrix with the same sparsity in factors, and we use the\nrecursive Sherman-Morrison-Woodbury matrix identity to obtain the factors of\nthe inverse. Additionally, we present an algorithm that computes the Cholesky\nfactorization of an expanded matrix with linear time and space complexities,\nyielding the covariance matrix as its Schur complement. This paper is\naccompanied by an open-source package that implements the proposed methods.\n","authors":["Tetiana Parshakova","Trevor Hastie","Stephen Boyd"],"pdf_url":"https://arxiv.org/pdf/2409.12067v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18779v2","updated":"2024-09-18T15:36:39Z","published":"2023-05-30T06:24:30Z","title":"A geometric view on probabilistically robust learning","summary":" Although deep neural networks have achieved super-human performance on many\nclassification tasks, they often exhibit a worrying lack of robustness towards\nadversarially generated examples. Thus, considerable effort has been invested\ninto reformulating standard Risk Minimization (RM) into an adversarially robust\nframework. Recently, attention has shifted towards approaches which interpolate\nbetween the robustness offered by adversarial training and the higher clean\naccuracy and faster training times of RM. 
In this paper, we take a fresh and\ngeometric view on one such method -- Probabilistically Robust Learning (PRL).\nWe propose a mathematical framework for understanding PRL, which allows us to\nidentify geometric pathologies in its original formulation and to introduce a\nfamily of probabilistic nonlocal perimeter functionals to rectify them. We\nprove existence of solutions to the original and modified problems using novel\nrelaxation methods and also study properties, as well as local limits, of the\nintroduced perimeters. We also clarify, through a suitable $\\Gamma$-convergence\nanalysis, the way in which the original and modified PRL models interpolate\nbetween risk minimization and adversarial training.\n","authors":["Leon Bungert","Nicolás García Trillos","Matt Jacobs","Daniel McKenzie","Đorđe Nikolić","Qingsong Wang"],"pdf_url":"https://arxiv.org/pdf/2305.18779v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12059v1","updated":"2024-09-18T15:32:48Z","published":"2024-09-18T15:32:48Z","title":"Dual-Layer Training and Decoding of Large Language Model with\n Simultaneously Thinking and Speaking","summary":" Large language models can reasonably understand and generate human expressions\nbut may lack thorough thinking and reasoning mechanisms. Recently, there have\nbeen several studies that enhance the thinking ability of language models, but\nmost of them are not data-driven or training-based. In this paper, we are\nmotivated by the cognitive mechanisms of the natural world and design a novel\nmodel architecture called TaS, which first considers the thoughts\nand then expresses the response based upon the query. We design several pipelines\nto annotate or generate the thought contents from prompt-response samples, then\nadd language heads in a middle layer which behaves as the thinking layer. We\ntrain the language model on the thoughts-augmented data and successfully let\nthe thinking layer automatically generate reasonable thoughts and finally\noutput more reasonable responses. Both qualitative examples and quantitative\nresults validate the effectiveness and performance of TaS. Our code is\navailable at https://anonymous.4open.science/r/TadE.\n","authors":["Ningyuan Xi","Xiaoyu Wang","Yetao Wu","Teng Chen","Qingqing Gu","Jinxian Qu","Zhonglin Jiang","Yong Chen","Luo Ji"],"pdf_url":"https://arxiv.org/pdf/2409.12059v1.pdf","comment":"9 pages, 5 figures"},{"id":"http://arxiv.org/abs/2409.12057v1","updated":"2024-09-18T15:31:29Z","published":"2024-09-18T15:31:29Z","title":"Cartan moving frames and the data manifolds","summary":" The purpose of this paper is to employ the language of Cartan moving frames\nto study the geometry of the data manifolds and its Riemannian structure, via\nthe data information metric and its curvature at data points. Using this\nframework and through experiments, explanations of the response of a neural\nnetwork are given by pointing out the output classes that are easily reachable\nfrom a given input. 
This emphasizes how the proposed mathematical relationship\nbetween the output of the network and the geometry of its inputs can be\nexploited as an explainable artificial intelligence tool.\n","authors":["Eliot Tron","Rita Fioresi","Nicolas Couellan","Stéphane Puechmorel"],"pdf_url":"https://arxiv.org/pdf/2409.12057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19472v3","updated":"2024-09-18T15:30:33Z","published":"2023-05-31T00:55:40Z","title":"PlaSma: Making Small Language Models Better Procedural Knowledge Models\n for (Counterfactual) Planning","summary":" Procedural planning, which entails decomposing a high-level goal into a\nsequence of temporally ordered steps, is an important yet intricate task for\nmachines. It involves integrating common-sense knowledge to reason about\ncomplex and often contextualized situations, e.g. ``scheduling a doctor's\nappointment without a phone''. While current approaches show encouraging\nresults using large language models (LLMs), they are hindered by drawbacks such\nas costly API calls and reproducibility issues. In this paper, we advocate\nplanning using smaller language models. We present PlaSma, a novel two-pronged\napproach to endow small language models with procedural knowledge and\n(constrained) language planning capabilities. More concretely, we develop\nsymbolic procedural knowledge distillation to enhance the commonsense knowledge\nin small language models and an inference-time algorithm to facilitate more\nstructured and accurate reasoning. In addition, we introduce a new related\ntask, Replanning, that requires a revision of a plan to cope with a constrained\nsituation. In both the planning and replanning settings, we show that\norders-of-magnitude smaller models (770M-11B parameters) can compete with and often\nsurpass their larger teacher models' capabilities. Finally, we showcase\nsuccessful application of PlaSma in an embodied environment, VirtualHome.\n","authors":["Faeze Brahman","Chandra Bhagavatula","Valentina Pyatkin","Jena D. Hwang","Xiang Lorraine Li","Hirona J. Arai","Soumya Sanyal","Keisuke Sakaguchi","Xiang Ren","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2305.19472v3.pdf","comment":"ICLR 2024 version, 31 pages"},{"id":"http://arxiv.org/abs/2409.12053v1","updated":"2024-09-18T15:26:15Z","published":"2024-09-18T15:26:15Z","title":"Extended Deep Submodular Functions","summary":" We introduce a novel category of set functions called Extended Deep\nSubmodular functions (EDSFs), which are neural network-representable. EDSFs\nserve as an extension of Deep Submodular Functions (DSFs), inheriting crucial\nproperties from DSFs while addressing innate limitations. It is known that DSFs\ncan represent a limiting subset of submodular functions. In contrast, through\nan analysis of polymatroid properties, we establish that EDSFs possess the\ncapability to represent all monotone submodular functions, a notable\nenhancement compared to DSFs. Furthermore, our findings demonstrate that EDSFs\ncan represent any monotone set function, indicating the family of EDSFs is\nequivalent to the family of all monotone set functions. Additionally, we prove\nthat EDSFs maintain the concavity inherent in DSFs when the components of the\ninput vector are non-negative real numbers, an essential feature in certain\ncombinatorial optimization problems. Through extensive experiments, we\nillustrate that EDSFs exhibit significantly lower empirical generalization\nerror than DSFs in the learning of coverage functions. 
This suggests that EDSFs\npresent a promising advancement in the representation and learning of set\nfunctions with improved generalization capabilities.\n","authors":["Seyed Mohammad Hosseini","Arash Jamshid","Seyed Mahdi Noormousavi","Mahdi Jafari Siavoshani","Naeimeh Omidvar"],"pdf_url":"https://arxiv.org/pdf/2409.12053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.14496v3","updated":"2024-09-18T15:25:49Z","published":"2024-08-23T16:33:57Z","title":"A New Era in Computational Pathology: A Survey on Foundation and\n Vision-Language Models","summary":" Recent advances in deep learning have completely transformed the domain of\ncomputational pathology (CPath). More specifically, they have altered the\ndiagnostic workflow of pathologists by integrating foundation models (FMs) and\nvision-language models (VLMs) in their assessment and decision-making process.\nThe limitations of existing deep learning approaches in CPath can be overcome\nby FMs through learning a representation space that can be adapted to a wide\nvariety of downstream tasks without explicit supervision. Deploying VLMs allows\npathology reports written in natural language to be used as rich semantic\ninformation sources to improve existing models as well as to generate predictions\nin natural language form. In this survey, a holistic and systematic overview of\nrecent innovations in FMs and VLMs in CPath is presented. Furthermore, the\ntools, datasets, and training schemes for these models are summarized, and the\nmodels are categorized into distinct groups. This extensive survey\nhighlights the current trends in CPath and its possible revolution through the\nuse of FMs and VLMs in the future.\n","authors":["Dibaloke Chanda","Milan Aryal","Nasim Yahya Soltani","Masoud Ganji"],"pdf_url":"https://arxiv.org/pdf/2408.14496v3.pdf","comment":"20 pages, 19 figures and 9 tables"},{"id":"http://arxiv.org/abs/2409.09263v3","updated":"2024-09-18T15:17:25Z","published":"2024-09-14T02:16:02Z","title":"Operational Wind Speed Forecasts for Chile's Electric Power Sector Using\n a Hybrid ML Model","summary":" As Chile's electric power sector advances toward a future powered by\nrenewable energy, accurate forecasting of renewable generation is essential for\nmanaging grid operations. The integration of renewable energy sources is\nparticularly challenging due to the operational difficulties of managing their\npower generation, which is highly variable compared to fossil fuel sources,\ndelaying the availability of clean energy. To mitigate this, we quantify the\nimpact of increasing intermittent generation from wind and solar on thermal\npower plants in Chile and introduce a hybrid wind speed forecasting methodology\nwhich combines two custom ML models for Chile. 
The first model is based on\nTiDE, an MLP-based ML model for short-term forecasts, and the second is based\non a graph neural network, GraphCast, for medium-term forecasts up to 10 days.\nOur hybrid approach outperforms the most accurate operational deterministic\nsystems by 4-21% for short-term forecasts and 5-23% for medium-term forecasts\nand can directly lower the impact of wind generation on thermal ramping,\ncurtailment, and system-level emissions in Chile.\n","authors":["Dhruv Suri","Praneet Dutta","Flora Xue","Ines Azevedo","Ravi Jain"],"pdf_url":"https://arxiv.org/pdf/2409.09263v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12043v1","updated":"2024-09-18T15:04:12Z","published":"2024-09-18T15:04:12Z","title":"Understanding the Effects of the Baidu-ULTR Logging Policy on Two-Tower\n Models","summary":" Despite the popularity of the two-tower model for unbiased learning to rank\n(ULTR) tasks, recent work suggests that it suffers from a major limitation that\ncould lead to its collapse in industry applications: the problem of logging\npolicy confounding. Several potential solutions have even been proposed;\nhowever, the evaluation of these methods was mostly conducted using\nsemi-synthetic simulation experiments. This paper bridges the gap between\ntheory and practice by investigating the confounding problem on the largest\nreal-world dataset, Baidu-ULTR. Our main contributions are threefold: 1) we\nshow that the conditions for the confounding problem are given on Baidu-ULTR,\n2) the confounding problem bears no significant effect on the two-tower model,\nand 3) we point to a potential mismatch between expert annotations, the golden\nstandard in ULTR, and user click behavior.\n","authors":["Morris de Haan","Philipp Hager"],"pdf_url":"https://arxiv.org/pdf/2409.12043v1.pdf","comment":"Accepted at the CONSEQUENCES '24 workshop, co-located with ACM RecSys\n '24"},{"id":"http://arxiv.org/abs/2405.15834v2","updated":"2024-09-18T14:59:09Z","published":"2024-05-24T09:15:29Z","title":"A Fisher-Rao gradient flow for entropic mean-field min-max games","summary":" Gradient flows play a substantial role in addressing many machine learning\nproblems. We examine the convergence in continuous-time of a\n\\textit{Fisher-Rao} (Mean-Field Birth-Death) gradient flow in the context of\nsolving convex-concave min-max games with entropy regularization. We propose\nappropriate Lyapunov functions to demonstrate convergence with explicit rates\nto the unique mixed Nash equilibrium.\n","authors":["Razvan-Andrei Lascu","Mateusz B. Majka","Łukasz Szpruch"],"pdf_url":"https://arxiv.org/pdf/2405.15834v2.pdf","comment":"24 pages. arXiv admin note: text overlap with arXiv:2306.03033"},{"id":"http://arxiv.org/abs/2409.12038v1","updated":"2024-09-18T14:57:13Z","published":"2024-09-18T14:57:13Z","title":"A Unified Framework for Neural Computation and Learning Over Time","summary":" This paper proposes Hamiltonian Learning, a novel unified framework for\nlearning with neural networks \"over time\", i.e., from a possibly infinite\nstream of data, in an online manner, without having access to future\ninformation. Existing works focus on the simplified setting in which the stream\nhas a known finite length or is segmented into smaller sequences, leveraging\nwell-established learning strategies from statistical machine learning. 
In this\npaper, the problem of learning over time is rethought from scratch, leveraging\ntools from optimal control theory, which yield a unifying view of the temporal\ndynamics of neural computations and learning. Hamiltonian Learning is based on\ndifferential equations that: (i) can be integrated without the need for external\nsoftware solvers; (ii) generalize the well-established notion of gradient-based\nlearning in feed-forward and recurrent networks; (iii) open up novel\nperspectives. The proposed framework is showcased by experimentally proving how\nit can recover gradient-based learning, comparing it to out-of-the-box\noptimizers, and describing how it is flexible enough to switch from fully-local\nto partially/non-local computational schemes, possibly distributed over\nmultiple devices, and to perform BackPropagation without storing activations. Hamiltonian\nLearning is easy to implement and can help researchers approach the problem of\nlearning over time in a principled and innovative manner.\n","authors":["Stefano Melacci","Alessandro Betti","Michele Casoni","Tommaso Guidi","Matteo Tiezzi","Marco Gori"],"pdf_url":"https://arxiv.org/pdf/2409.12038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12033v1","updated":"2024-09-18T14:49:25Z","published":"2024-09-18T14:49:25Z","title":"Topological Deep Learning with State-Space Models: A Mamba Approach for\n Simplicial Complexes","summary":" Graph Neural Networks based on the message-passing (MP) mechanism are a\ndominant approach for handling graph-structured data. However, they are\ninherently limited to modeling only pairwise interactions, making it difficult\nto explicitly capture the complexity of systems with $n$-body relations. To\naddress this, topological deep learning has emerged as a promising field for\nstudying and modeling higher-order interactions using various topological\ndomains, such as simplicial and cellular complexes. While these new domains\nprovide powerful representations, they introduce new challenges, such as\neffectively modeling the interactions among higher-order structures through\nhigher-order MP. Meanwhile, structured state-space sequence models have proven\nto be effective for sequence modeling and have recently been adapted for graph\ndata by encoding the neighborhood of a node as a sequence, thereby avoiding the\nMP mechanism. In this work, we propose a novel architecture designed to operate\nwith simplicial complexes, utilizing the Mamba state-space model as its\nbackbone. Our approach generates sequences for the nodes based on the\nneighboring cells, enabling direct communication between all higher-order\nstructures, regardless of their rank. We extensively validate our model,\ndemonstrating that it achieves competitive performance compared to\nstate-of-the-art models developed for simplicial complexes.\n","authors":["Marco Montagna","Simone Scardapane","Lev Telyatnikov"],"pdf_url":"https://arxiv.org/pdf/2409.12033v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.14496v2","updated":"2024-09-18T14:43:37Z","published":"2024-05-23T12:28:16Z","title":"Hybrid Top-Down Global Causal Discovery with Local Search for Linear and\n Nonlinear Additive Noise Models","summary":" Learning the unique directed acyclic graph corresponding to an unknown causal\nmodel is a challenging task. Methods based on functional causal models can\nidentify a unique graph, but either suffer from the curse of dimensionality or\nimpose strong parametric assumptions. 
To address these challenges, we propose a\nnovel hybrid approach for global causal discovery in observational data that\nleverages local causal substructures. We first present a topological sorting\nalgorithm that leverages ancestral relationships in linear structural equation\nmodels to establish a compact top-down hierarchical ordering, encoding more\ncausal information than linear orderings produced by existing methods. We\ndemonstrate that this approach generalizes to nonlinear settings with arbitrary\nnoise. We then introduce a nonparametric constraint-based algorithm that prunes\nspurious edges by searching for local conditioning sets, achieving greater\naccuracy than current methods. We provide theoretical guarantees for\ncorrectness and worst-case polynomial time complexities, with empirical\nvalidation on synthetic data.\n","authors":["Sujai Hiremath","Jacqueline R. M. A. Maasch","Mengxiao Gao","Promit Ghosal","Kyra Gan"],"pdf_url":"https://arxiv.org/pdf/2405.14496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12026v1","updated":"2024-09-18T14:36:50Z","published":"2024-09-18T14:36:50Z","title":"On Vision Transformers for Classification Tasks in Side-Scan Sonar\n Imagery","summary":" Side-scan sonar (SSS) imagery presents unique challenges in the\nclassification of man-made objects on the seafloor due to the complex and\nvaried underwater environments. Historically, experts have manually interpreted\nSSS images, relying on conventional machine learning techniques with\nhand-crafted features. While Convolutional Neural Networks (CNNs) significantly\nadvanced automated classification in this domain, they often fall short when\ndealing with diverse seafloor textures, such as rocky or ripple sand bottoms,\nwhere false positive rates may increase. Recently, Vision Transformers (ViTs)\nhave shown potential in addressing these limitations by utilizing a\nself-attention mechanism to capture global information in image patches,\noffering more flexibility in processing spatial hierarchies. This paper\nrigorously compares the performance of ViT models alongside commonly used CNN\narchitectures, such as ResNet and ConvNext, for binary classification tasks in\nSSS imagery. The dataset encompasses diverse geographical seafloor types and is\nbalanced between the presence and absence of man-made objects. ViT-based models\nexhibit superior classification performance across f1-score, precision, recall,\nand accuracy metrics, although at the cost of greater computational resources.\nCNNs, with their inductive biases, demonstrate better computational efficiency,\nmaking them suitable for deployment in resource-constrained environments like\nunderwater vehicles. Future research directions include exploring\nself-supervised learning for ViTs and multi-modal fusion to further enhance\nperformance in challenging underwater environments.\n","authors":["BW Sheffield","Jeffrey Ellen","Ben Whitmore"],"pdf_url":"https://arxiv.org/pdf/2409.12026v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12020v1","updated":"2024-09-18T14:30:48Z","published":"2024-09-18T14:30:48Z","title":"Promise and Peril of Collaborative Code Generation Models: Balancing\n Effectiveness and Memorization","summary":" In the rapidly evolving field of machine learning, training models with\ndatasets from various locations and organizations presents significant\nchallenges due to privacy and legal concerns. 
The exploration of effective\ncollaborative training settings capable of leveraging valuable knowledge from\ndistributed and isolated datasets is increasingly crucial. This study\ninvestigates key factors that impact the effectiveness of collaborative\ntraining methods in code next-token prediction, as well as the correctness and\nutility of the generated code, demonstrating the promise of such methods.\nAdditionally, we evaluate the memorization of different participant training\ndata across various collaborative training settings, including centralized,\nfederated, and incremental training, highlighting their potential risks of\nleaking data. Our findings indicate that the size and diversity of code\ndatasets are pivotal factors influencing the success of collaboratively trained\ncode models. We show that federated learning achieves competitive performance\ncompared to centralized training while offering better data protection, as\nevidenced by lower memorization ratios in the generated code. However,\nfederated learning can still produce verbatim code snippets from hidden\ntraining data, potentially violating privacy or copyright. Our study further\nexplores effectiveness and memorization patterns in incremental learning,\nemphasizing the sequence in which individual participant datasets are\nintroduced. We also identify cross-organizational clones as a prevalent\nchallenge in both centralized and federated learning scenarios. Our findings\nhighlight the persistent risk of data leakage during inference, even when\ntraining data remains unseen. We conclude with recommendations for\npractitioners and researchers to optimize multisource datasets, propelling\ncross-organizational collaboration forward.\n","authors":["Zhi Chen","Lingxiao Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.12020v1.pdf","comment":"Paper accepted to the ASE 2024 Conference Research Track"},{"id":"http://arxiv.org/abs/2409.12015v1","updated":"2024-09-18T14:29:14Z","published":"2024-09-18T14:29:14Z","title":"All-in-one foundational models learning across quantum chemical levels","summary":" Machine learning (ML) potentials typically target a single quantum chemical\n(QC) level, while the ML models developed for multi-fidelity learning have not\nbeen shown to provide scalable solutions for foundational models. Here we\nintroduce the all-in-one (AIO) ANI model architecture based on multimodal\nlearning, which can learn an arbitrary number of QC levels. Our all-in-one\nlearning approach offers a more general and easier-to-use alternative to\ntransfer learning. We use it to train the AIO-ANI-UIP foundational model with\ngeneralization capability comparable to semi-empirical GFN2-xTB and DFT\nwith a double-zeta basis set for organic molecules. We show that the AIO-ANI\nmodel can learn across different QC levels ranging from semi-empirical to\ndensity functional theory to coupled cluster. We also use AIO models to design\nthe foundational model {\\Delta}-AIO-ANI based on {\\Delta}-learning with\nincreased accuracy and robustness compared to AIO-ANI-UIP. The code and the\nfoundational models are available at https://github.com/dralgroup/aio-ani; they\nwill be integrated into the universal and updatable AI-enhanced QM (UAIQM)\nlibrary and made available in the MLatom package so that they can be used\nonline at the XACS cloud computing platform (see\nhttps://github.com/dralgroup/mlatom for updates).\n","authors":["Yuxinxin Chen","Pavlo O. 
Dral"],"pdf_url":"https://arxiv.org/pdf/2409.12015v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.01535v3","updated":"2024-09-18T14:20:32Z","published":"2024-03-03T15:28:47Z","title":"Neural Graph Generator: Feature-Conditioned Graph Generation using\n Latent Diffusion Models","summary":" Graph generation has emerged as a crucial task in machine learning, with\nsignificant challenges in generating graphs that accurately reflect specific\nproperties. Existing methods often fall short in efficiently addressing this\nneed as they struggle with the high-dimensional complexity and varied nature of\ngraph properties. In this paper, we introduce the Neural Graph Generator (NGG),\na novel approach which utilizes conditioned latent diffusion models for graph\ngeneration. NGG demonstrates a remarkable capacity to model complex graph\npatterns, offering control over the graph generation process. NGG employs a\nvariational graph autoencoder for graph compression and a diffusion process in\nthe latent vector space, guided by vectors summarizing graph statistics. We\ndemonstrate NGG's versatility across various graph generation tasks, showing\nits capability to capture desired graph properties and generalize to unseen\ngraphs. We also compare our generator to the graph generation capabilities of\ndifferent LLMs. This work signifies a shift in graph generation methodologies,\noffering a more practical and efficient solution for generating diverse graphs\nwith specific characteristics.\n","authors":["Iakovos Evdaimon","Giannis Nikolentzos","Christos Xypolopoulos","Ahmed Kammoun","Michail Chatzianastasis","Hadi Abdine","Michalis Vazirgiannis"],"pdf_url":"https://arxiv.org/pdf/2403.01535v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12001v1","updated":"2024-09-18T14:13:24Z","published":"2024-09-18T14:13:24Z","title":"Putting Data at the Centre of Offline Multi-Agent Reinforcement Learning","summary":" Offline multi-agent reinforcement learning (MARL) is an exciting direction of\nresearch that uses static datasets to find optimal control policies for\nmulti-agent systems. Though the field is by definition data-driven, efforts\nhave thus far neglected data in their drive to achieve state-of-the-art\nresults. We first substantiate this claim by surveying the literature, showing\nhow the majority of works generate their own datasets without consistent\nmethodology and provide sparse information about the characteristics of these\ndatasets. We then show why neglecting the nature of the data is problematic,\nthrough salient examples of how tightly algorithmic performance is coupled to\nthe dataset used, necessitating a common foundation for experiments in the\nfield. In response, we take a big step towards improving data usage and data\nawareness in offline MARL, with three key contributions: (1) a clear guideline\nfor generating novel datasets; (2) a standardisation of over 80 existing\ndatasets, hosted in a publicly available repository, using a consistent storage\nformat and easy-to-use API; and (3) a suite of analysis tools that allow us to\nunderstand these datasets better, aiding further development.\n","authors":["Claude Formanek","Louise Beyers","Callum Rhys Tilbury","Jonathan P. 
Shock","Arnu Pretorius"],"pdf_url":"https://arxiv.org/pdf/2409.12001v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12000v1","updated":"2024-09-18T14:12:01Z","published":"2024-09-18T14:12:01Z","title":"\"It Might be Technically Impressive, But It's Practically Useless to\n Us\": Practices, Challenges, and Opportunities for Cross-Functional\n Collaboration around AI within the News Industry","summary":" Recently, an increasing number of news organizations have integrated\nartificial intelligence (AI) into their workflows, leading to a further influx\nof AI technologists and data workers into the news industry. This has initiated\ncross-functional collaborations between these professionals and journalists.\nWhile prior research has explored the impact of AI-related roles entering the\nnews industry, there is a lack of studies on how cross-functional collaboration\nunfolds between AI professionals and journalists. Through interviews with 17\njournalists, 6 AI technologists, and 3 AI workers with cross-functional\nexperience from leading news organizations, we investigate the current\npractices, challenges, and opportunities for cross-functional collaboration\naround AI in today's news industry. We first study how journalists and AI\nprofessionals perceive existing cross-collaboration strategies. We further\nexplore the challenges of cross-functional collaboration and provide\nrecommendations for enhancing future cross-functional collaboration around AI\nin the news industry.\n","authors":["Qing Xiao","Xianzhe Fan","Felix M. Simon","Bingbing Zhang","Motahhare Eslami"],"pdf_url":"https://arxiv.org/pdf/2409.12000v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2312.11582v2","updated":"2024-09-18T14:05:32Z","published":"2023-12-18T11:37:19Z","title":"Shapley-PC: Constraint-based Causal Structure Learning with Shapley\n Values","summary":" Causal Structure Learning (CSL), also referred to as causal discovery,\namounts to extracting causal relations among variables in data. CSL enables the\nestimation of causal effects from observational data alone, avoiding the need\nto perform real life experiments. Constraint-based CSL leverages conditional\nindependence tests to perform causal discovery. We propose Shapley-PC, a novel\nmethod to improve constraint-based CSL algorithms by using Shapley values over\nthe possible conditioning sets, to decide which variables are responsible for\nthe observed conditional (in)dependences. We prove soundness, completeness and\nasymptotic consistency of Shapley-PC and run a simulation study showing that\nour proposed algorithm is superior to existing versions of PC.\n","authors":["Fabrizio Russo","Francesca Toni"],"pdf_url":"https://arxiv.org/pdf/2312.11582v2.pdf","comment":"21 pages (with appendix)"},{"id":"http://arxiv.org/abs/2404.00297v4","updated":"2024-09-18T14:05:31Z","published":"2024-03-30T09:20:43Z","title":"A Hybrid Transformer and Attention Based Recurrent Neural Network for\n Robust and Interpretable Sentiment Analysis of Tweets","summary":" Sentiment analysis is crucial for understanding public opinion and consumer\nbehavior. Existing models face challenges with linguistic diversity,\ngeneralizability, and explainability. We propose TRABSA, a hybrid framework\nintegrating transformer-based architectures, attention mechanisms, and BiLSTM\nnetworks to address this. 
Leveraging RoBERTa trained on 124M tweets, we bridge\ngaps in sentiment analysis benchmarks, ensuring state-of-the-art accuracy.\nAugmenting datasets with tweets from 32 countries and US states, we compare six\nword-embedding techniques and three lexicon-based labeling techniques,\nselecting the best for optimal sentiment analysis. TRABSA outperforms\ntraditional ML and deep learning models with 94% accuracy and significant\nprecision, recall, and F1-score gains. Evaluation across diverse datasets\ndemonstrates consistent superiority and generalizability. SHAP and LIME\nanalyses enhance interpretability, improving confidence in predictions. Our\nstudy facilitates pandemic resource management, aiding resource planning,\npolicy formation, and vaccination tactics.\n","authors":["Md Abrar Jahin","Md Sakib Hossain Shovon","M. F. Mridha","Md Rashedul Islam","Yutaka Watanobe"],"pdf_url":"https://arxiv.org/pdf/2404.00297v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11995v1","updated":"2024-09-18T14:04:15Z","published":"2024-09-18T14:04:15Z","title":"Unraveling the Hessian: A Key to Smooth Convergence in Loss Function\n Landscapes","summary":" The loss landscape of neural networks is a critical aspect of their training,\nand understanding its properties is essential for improving their performance.\nIn this paper, we investigate how the loss surface changes when the sample size\nincreases, a previously unexplored issue. We theoretically analyze the\nconvergence of the loss landscape in a fully connected neural network and\nderive upper bounds for the difference in loss function values when adding a\nnew object to the sample. Our empirical study confirms these results on various\ndatasets, demonstrating the convergence of the loss function surface for image\nclassification tasks. Our findings provide insights into the local geometry of\nneural loss landscapes and have implications for the development of sample size\ndetermination techniques.\n","authors":["Nikita Kiselev","Andrey Grabovoy"],"pdf_url":"https://arxiv.org/pdf/2409.11995v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.11973v5","updated":"2024-09-18T14:02:13Z","published":"2023-12-19T09:11:49Z","title":"Continual Learning: Forget-free Winning Subnetworks for Video\n Representations","summary":" Inspired by the Lottery Ticket Hypothesis (LTH), which highlights the\nexistence of efficient subnetworks within larger, dense networks, a\nhigh-performing Winning Subnetwork (WSN), in terms of task performance under\nappropriate sparsity conditions, is considered for various continual learning\ntasks. It leverages pre-existing weights from dense networks to achieve\nefficient learning in Task Incremental Learning (TIL) and Task-agnostic\nIncremental Learning (TaIL) scenarios. In Few-Shot Class Incremental Learning\n(FSCIL), a variation of WSN referred to as the Soft subnetwork (SoftNet) is\ndesigned to prevent overfitting when the data samples are scarce. Furthermore,\nthe sparse reuse of WSN weights is considered for Video Incremental Learning\n(VIL). The use of Fourier Subneural Operator (FSO) within WSN is considered. It\nenables compact encoding of videos and identifies reusable subnetworks across\nvarying bandwidths. We have integrated FSO into different architectural\nframeworks for continual learning, including VIL, TIL, and FSCIL. 
Our\ncomprehensive experiments demonstrate FSO's effectiveness, significantly\nimproving task performance at various convolutional representational levels.\nSpecifically, FSO enhances higher-layer performance in TIL and FSCIL and\nlower-layer performance in VIL.\n","authors":["Haeyong Kang","Jaehong Yoon","Sung Ju Hwang","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2312.11973v5.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2303.14962,\n arXiv:2306.11305"},{"id":"http://arxiv.org/abs/2407.20784v2","updated":"2024-09-18T14:01:47Z","published":"2024-07-27T15:41:13Z","title":"Inverse Problems with Diffusion Models: A MAP Estimation Perspective","summary":" Inverse problems have many applications in science and engineering. In\ncomputer vision, several image restoration tasks such as inpainting,\ndeblurring, and super-resolution can be formally modeled as inverse problems.\nRecently, methods have been developed for solving inverse problems that only\nleverage a pre-trained unconditional diffusion model and do not require\nadditional task-specific training. In such methods, however, the inherent\nintractability of determining the conditional score function during the reverse\ndiffusion process poses a real challenge, leaving the methods to settle for an\napproximation instead, which affects their performance in practice. Here, we\npropose a MAP estimation framework to model the reverse conditional generation\nprocess of a continuous-time diffusion model as an optimization process of the\nunderlying MAP objective, whose gradient term is tractable. In theory, the\nproposed framework can be applied to solve general inverse problems using\ngradient-based optimization methods. However, given the highly non-convex\nnature of the loss objective, finding a perfect gradient-based optimization\nalgorithm can be quite challenging; nevertheless, our framework offers several\npotential research directions. We use our proposed formulation to develop\nempirically effective algorithms for image restoration. We validate our\nproposed algorithms with extensive experiments over multiple datasets across\nseveral restoration tasks.\n","authors":["Sai Bharath Chandra Gutha","Ricardo Vinuesa","Hossein Azizpour"],"pdf_url":"https://arxiv.org/pdf/2407.20784v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.00099v3","updated":"2024-09-18T13:45:08Z","published":"2024-04-30T18:00:02Z","title":"Creative Beam Search: LLM-as-a-Judge For Improving Response Generation","summary":" Large language models are revolutionizing several areas, including artificial\ncreativity. However, the process of generation in machines profoundly diverges\nfrom that observed in humans. In particular, machine generation is\ncharacterized by a lack of intentionality and of an underlying creative process.\nWe propose a method called Creative Beam Search that uses Diverse Beam Search\nand LLM-as-a-Judge to perform response generation and response validation. The\nresults of a qualitative experiment show how our approach can provide better\noutput than standard sampling techniques. 
We also show that the response\nvalidation step is a necessary complement to the response generation step.\n","authors":["Giorgio Franceschelli","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2405.00099v3.pdf","comment":"Presented as a short paper at the 15th International Conference on\n Computational Creativity (ICCC'24)"},{"id":"http://arxiv.org/abs/2409.11985v1","updated":"2024-09-18T13:43:39Z","published":"2024-09-18T13:43:39Z","title":"An Efficient Model-Agnostic Approach for Uncertainty Estimation in\n Data-Restricted Pedometric Applications","summary":" This paper introduces a model-agnostic approach designed to enhance\nuncertainty estimation in the predictive modeling of soil properties, a crucial\nfactor for advancing pedometrics and the practice of digital soil mapping. To\naddress the typical challenge of data scarcity in soil studies, we present\nan improved technique for uncertainty estimation. This method is based on the\ntransformation of regression tasks into classification problems, which not only\nallows for the production of reliable uncertainty estimates but also enables\nthe application of established machine learning algorithms with competitive\nperformance that have not yet been utilized in pedometrics. Empirical results\nfrom datasets collected from two German agricultural fields showcase the\npractical application of the proposed methodology. Our results and findings\nsuggest that the proposed approach has the potential to provide better\nuncertainty estimation than the models commonly used in pedometrics.\n","authors":["Viacheslav Barkov","Jonas Schmidinger","Robin Gebbers","Martin Atzmueller"],"pdf_url":"https://arxiv.org/pdf/2409.11985v1.pdf","comment":"To be published in the proceedings of ICMLA 2024: 23rd International\n Conference on Machine Learning and Applications"},{"id":"http://arxiv.org/abs/2407.13493v2","updated":"2024-09-18T13:41:20Z","published":"2024-07-18T13:23:16Z","title":"Training Foundation Models as Data Compression: On Information, Model\n Weights and Copyright Law","summary":" The training process of foundation models, as for other classes of deep\nlearning systems, is based on minimizing the reconstruction error over a\ntraining set. For this reason, they are susceptible to the memorization and\nsubsequent reproduction of training samples. In this paper, we introduce a\ntraining-as-compressing perspective, wherein the model's weights embody a\ncompressed representation of the training data. From a copyright standpoint,\nthis point of view implies that the weights could be considered a reproduction\nor a derivative work of a potentially protected set of works. We investigate\nthe technical and legal challenges that emerge from this framing of the\ncopyright of outputs generated by foundation models, including their\nimplications for practitioners and researchers. 
We demonstrate that adopting an\ninformation-centric approach to the problem presents a promising pathway for\ntackling these emerging complex legal issues.\n","authors":["Giorgio Franceschelli","Claudia Cevenini","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2407.13493v2.pdf","comment":"Spotlight presentation at GenLaw'24, see\n https://www.genlaw.org/2024-icml-papers#training-foundation-models-as-data-compression-on-information-model-weights-and-copyright-law"},{"id":"http://arxiv.org/abs/2306.06402v2","updated":"2024-09-18T13:32:45Z","published":"2023-06-10T10:04:54Z","title":"A Single-Loop Deep Actor-Critic Algorithm for Constrained Reinforcement\n Learning with Provable Convergence","summary":" Deep Actor-Critic algorithms, which combine Actor-Critic with deep neural\nnetworks (DNNs), have been among the most prevalent reinforcement learning\nalgorithms for decision-making problems in simulated environments. However, the\nexisting deep Actor-Critic algorithms are still not mature enough to solve realistic\nproblems with non-convex stochastic constraints and a high cost of interacting with\nthe environment. In this paper, we propose a single-loop deep Actor-Critic\n(SLDAC) algorithmic framework for general constrained reinforcement learning\n(CRL) problems. In the actor step, the constrained stochastic successive convex\napproximation (CSSCA) method is applied to handle the non-convex stochastic\nobjective and constraints. In the critic step, the critic DNNs are only updated\nonce or a few finite times for each iteration, which simplifies the algorithm\nto a single-loop framework (the existing works require a sufficient number of\nupdates for the critic step to ensure a good enough convergence of the inner\nloop for each iteration). Moreover, the variance of the policy gradient\nestimation is reduced by reusing observations from the old policy. The\nsingle-loop design and the observation reuse effectively reduce the\nagent-environment interaction cost and computational complexity. In spite of\nthe biased policy gradient estimation incurred by the single-loop design and\nobservation reuse, we prove that the SLDAC with a feasible initial point can\nconverge to a Karush-Kuhn-Tucker (KKT) point of the original problem almost\nsurely. Simulations show that the SLDAC algorithm can achieve superior\nperformance with much lower interaction cost.\n","authors":["Kexuan Wang","An Liu","Baishuo Lin"],"pdf_url":"https://arxiv.org/pdf/2306.06402v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10576v2","updated":"2024-09-18T13:27:43Z","published":"2024-09-15T15:21:45Z","title":"Language Models and Retrieval Augmented Generation for Automated\n Structured Data Extraction from Diagnostic Reports","summary":" Purpose: To develop and evaluate an automated system for extracting\nstructured clinical information from unstructured radiology and pathology\nreports using open-weights large language models (LMs) and retrieval augmented\ngeneration (RAG), and to assess the effects of model configuration variables on\nextraction performance. Methods and Materials: The study utilized two datasets:\n7,294 radiology reports annotated for Brain Tumor Reporting and Data System\n(BT-RADS) scores and 2,154 pathology reports annotated for isocitrate\ndehydrogenase (IDH) mutation status. An automated pipeline was developed to\nbenchmark the performance of various LMs and RAG configurations. 
The impact of\nmodel size, quantization, prompting strategies, output formatting, and\ninference parameters was systematically evaluated. Results: The best-performing\nmodels achieved over 98% accuracy in extracting BT-RADS scores from radiology\nreports and over 90% for IDH mutation status extraction from pathology reports.\nThe top-performing model was a medically fine-tuned llama3. Larger, newer, and\ndomain-fine-tuned models consistently outperformed older and smaller models. Model\nquantization had minimal impact on performance. Few-shot prompting\nsignificantly improved accuracy. RAG improved performance for complex pathology\nreports but not for shorter radiology reports. Conclusions: Open LMs\ndemonstrate significant potential for automated extraction of structured\nclinical data from unstructured clinical reports in local, privacy-preserving\napplications. Careful model selection, prompt engineering, and semi-automated\noptimization using annotated data are critical for optimal performance. These\napproaches could be reliable enough for practical use in research workflows,\nhighlighting the potential for human-machine collaboration in healthcare data\nextraction.\n","authors":["Mohamed Sobhi Jabal","Pranav Warman","Jikai Zhang","Kartikeye Gupta","Ayush Jain","Maciej Mazurowski","Walter Wiggins","Kirti Magudia","Evan Calabrese"],"pdf_url":"https://arxiv.org/pdf/2409.10576v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11059v2","updated":"2024-09-18T13:27:39Z","published":"2024-09-17T10:38:46Z","title":"OneEncoder: A Lightweight Framework for Progressive Alignment of\n Modalities","summary":" Cross-modal alignment learning integrates information from different\nmodalities like text, image, audio, and video to create unified models. This\napproach develops shared representations and learns correlations between\nmodalities, enabling applications such as visual question answering and\naudiovisual content analysis. Current techniques rely on large\nmodality-specific encoders, necessitating fine-tuning or training from scratch\non vast aligned datasets (e.g., text-image, text-audio, image-audio). This\napproach has limitations: (i) it is very expensive due to the need for training\nlarge encoders on extensive datasets, (ii) acquiring aligned large paired\ndatasets is challenging, and (iii) adding new modalities requires retraining\nthe entire framework to incorporate these modalities. To address these issues,\nwe propose OneEncoder, a lightweight framework that progressively represents\nand aligns four modalities (image, text, audio, video). Initially, we train a\nlightweight Universal Projection module (UP) to align image and text\nmodalities. Then, we freeze the pretrained UP and progressively align future\nmodalities to those already aligned. OneEncoder operates efficiently and\ncost-effectively, even in scenarios where vast aligned datasets are\nunavailable, due to its lightweight design. 
Trained on small paired datasets,\nit shows strong performance in tasks like classification, querying, and visual\nquestion answering, surpassing methods that rely on large datasets and\nspecialized encoders.\n","authors":["Bilal Faye","Hanane Azzag","Mustapha Lebbah"],"pdf_url":"https://arxiv.org/pdf/2409.11059v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11972v1","updated":"2024-09-18T13:24:44Z","published":"2024-09-18T13:24:44Z","title":"Metric-Semantic Factor Graph Generation based on Graph Neural Networks","summary":" Understanding the relationships between geometric structures and semantic\nconcepts is crucial for building accurate models of complex environments. In\nindoor environments, certain spatial constraints, such as the relative positioning of\nplanes, remain consistent despite variations in layout. This paper explores how\nthese invariant relationships can be captured in a graph SLAM framework by\nrepresenting high-level concepts like rooms and walls, linking them to\ngeometric elements like planes through an optimizable factor graph. Several\nefforts have tackled this issue with ad-hoc solutions for each concept\ngeneration and with manually defined factors.\n This paper proposes a novel method for metric-semantic factor graph\ngeneration, which includes defining a semantic scene graph, integrating\ngeometric information, and learning the interconnecting factors, all based on\nGraph Neural Networks (GNNs). An edge classification network (G-GNN) sorts the\nedges between planes into same-room, same-wall, or none types. The resulting\nrelations are clustered, generating a room or wall for each cluster. A second\nfamily of networks (F-GNN) infers the geometrical origin of the new nodes. The\ndefinition of the factors employs the same F-GNN used for the metric attribute\nof the generated nodes. Furthermore, the new factor graph is shared with the\nS-Graphs+ algorithm, extending its graph expressiveness and scene\nrepresentation with the ultimate goal of improving the SLAM performance. The\ncomplexity of the environments is increased to N-plane rooms, while the\nnetworks are trained only on L-shaped rooms. The framework is evaluated in synthetic and\nsimulated scenarios as no real datasets of the required complex layouts are\navailable.\n","authors":["Jose Andres Millan-Romera","Hriday Bavle","Muhammad Shaheer","Holger Voos","Jose Luis Sanchez-Lopez"],"pdf_url":"https://arxiv.org/pdf/2409.11972v1.pdf","comment":"Submitted to ICRA 2025"},{"id":"http://arxiv.org/abs/2409.11968v1","updated":"2024-09-18T13:20:23Z","published":"2024-09-18T13:20:23Z","title":"Efficacy of Synthetic Data as a Benchmark","summary":" Large language models (LLMs) have enabled a range of applications in\nzero-shot and few-shot learning settings, including the generation of synthetic\ndatasets for training and testing. However, to reliably use these synthetic\ndatasets, it is essential to understand how representative they are of\nreal-world data. We investigate this by assessing the effectiveness of\ngenerating synthetic data with an LLM and using it as a benchmark for various\nNLP tasks. Our experiments across six datasets and three different tasks show\nthat while synthetic data can effectively capture the performance of various\nmethods for simpler tasks, such as intent classification, it falls short for\nmore complex tasks like named entity recognition. 
Additionally, we propose a\nnew metric called the bias factor, which evaluates the biases introduced when\nthe same LLM is used both to generate benchmarking data and to perform the\ntasks. We find that smaller LLMs exhibit biases towards their own generated\ndata, whereas larger models do not. Overall, our findings suggest that the\neffectiveness of synthetic data as a benchmark varies depending on the task,\nand that practitioners should rely on data generated from multiple larger\nmodels whenever possible.\n","authors":["Gaurav Maheshwari","Dmitry Ivanov","Kevin El Haddad"],"pdf_url":"https://arxiv.org/pdf/2409.11968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11964v1","updated":"2024-09-18T13:16:00Z","published":"2024-09-18T13:16:00Z","title":"Data Efficient Acoustic Scene Classification using Teacher-Informed\n Confusing Class Instruction","summary":" In this technical report, we describe the SNTL-NTU team's submission for Task\n1 Data-Efficient Low-Complexity Acoustic Scene Classification of the detection\nand classification of acoustic scenes and events (DCASE) 2024 challenge. Three\nsystems are introduced to tackle training splits of different sizes. For small\ntraining splits, we explored reducing the complexity of the provided baseline\nmodel by reducing the number of base channels. We introduce data augmentation\nin the form of mixup to increase the diversity of training samples. For the\nlarger training splits, we use FocusNet to provide confusing class information\nto an ensemble of multiple Patchout faSt Spectrogram Transformer (PaSST) models\nand baseline models trained on the original sampling rate of 44.1 kHz. We use\nKnowledge Distillation to distill the ensemble model to the baseline student\nmodel. Training the systems on the TAU Urban Acoustic Scene 2022 Mobile\ndevelopment dataset yielded the highest average testing accuracies of (62.21,\n59.82, 56.81, 53.03, 47.97)% on splits of (100, 50, 25, 10, 5)%, respectively, over\nthe three systems.\n","authors":["Jin Jie Sean Yeo","Ee-Leng Tan","Jisheng Bai","Santi Peksi","Woon-Seng Gan"],"pdf_url":"https://arxiv.org/pdf/2409.11964v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.10516v2","updated":"2024-09-18T13:11:13Z","published":"2024-09-16T17:59:52Z","title":"RetrievalAttention: Accelerating Long-Context LLM Inference via Vector\n Retrieval","summary":" Transformer-based Large Language Models (LLMs) have become increasingly\nimportant. However, due to the quadratic time complexity of attention\ncomputation, scaling LLMs to longer contexts incurs extremely slow inference\nlatency and high GPU memory consumption for caching key-value (KV) vectors.\nThis paper proposes RetrievalAttention, a training-free approach to both\naccelerate attention computation and reduce GPU memory consumption. By\nleveraging the dynamic sparsity of the attention mechanism, RetrievalAttention\nproposes to use approximate nearest neighbor search (ANNS) indexes for KV\nvectors in CPU memory and retrieves the most relevant ones with vector search\nduring generation. Unfortunately, we observe that the off-the-shelf ANNS\nindexes are often ineffective for such retrieval tasks due to the\nout-of-distribution (OOD) gap between query vectors and key vectors in the attention\nmechanism. RetrievalAttention addresses the OOD challenge by designing an\nattention-aware vector search algorithm that can adapt to the distribution of\nquery vectors. 
Our evaluation shows that RetrievalAttention only needs to\naccess 1--3% of the data while maintaining high model accuracy. This leads to a\nsignificant reduction in the inference cost of long-context LLMs with a much\nlower GPU memory footprint. In particular, RetrievalAttention only needs a\nsingle NVIDIA RTX4090 (24GB) for serving 128K tokens in LLMs with 8B\nparameters, which is capable of generating one token in 0.188 seconds.\n","authors":["Di Liu","Meng Chen","Baotong Lu","Huiqiang Jiang","Zhenhua Han","Qianxi Zhang","Qi Chen","Chengruidong Zhang","Bailu Ding","Kai Zhang","Chen Chen","Fan Yang","Yuqing Yang","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2409.10516v2.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2312.02515v2","updated":"2024-09-18T13:07:28Z","published":"2023-12-05T05:38:38Z","title":"mLoRA: Fine-Tuning LoRA Adapters via Highly-Efficient Pipeline\n Parallelism in Multiple GPUs","summary":" Transformer-based, pre-trained large language models (LLMs) have demonstrated\noutstanding performance across diverse domains, particularly in the emerging\n{\\em pretrain-then-finetune} paradigm. Low-Rank Adaptation (LoRA), a\nparameter-efficient fine-tuning method, is commonly used to adapt a base LLM to\nmultiple downstream tasks. Further, LLM platforms enable developers to\nfine-tune multiple models and develop various domain-specific applications\nsimultaneously. However, existing model parallelism schemes suffer from high\ncommunication overhead and inefficient GPU utilization when training multiple\nLoRA tasks across GPUs and machines.\n In this paper, we present mLoRA, a parallelism-efficient fine-tuning system\ndesigned for training multiple LoRA adapters across GPUs and machines. mLoRA introduces\na novel LoRA-aware pipeline parallelism scheme that efficiently pipelines\nindependent LoRA adapters and their distinct fine-tuning stages across GPUs and\nmachines, along with a new LoRA-efficient operator to enhance GPU utilization\nduring pipelined LoRA training. Our extensive evaluation shows that mLoRA can\nsignificantly reduce average fine-tuning task completion time, e.g., by 30\\%,\ncompared to state-of-the-art methods like FSDP. More importantly, mLoRA enables\nsimultaneous fine-tuning of larger models, e.g., two Llama-2-13B models on four\nNVIDIA RTX A6000 48GB GPUs, which is not feasible for FSDP due to high memory\nrequirements. Hence, mLoRA not only increases fine-tuning efficiency but also\nmakes it more accessible on cost-effective GPUs. mLoRA has been deployed in\nAntGroup's production environment.\n","authors":["Zhengmao Ye","Dengchun Li","Zetao Hu","Tingfeng Lan","Jian Sha","Sicong Zhang","Lei Duan","Jie Zuo","Hui Lu","Yuanchun Zhou","Mingjie Tang"],"pdf_url":"https://arxiv.org/pdf/2312.02515v2.pdf","comment":"14 pages, 16 figures"},{"id":"http://arxiv.org/abs/2308.16534v2","updated":"2024-09-18T13:06:30Z","published":"2023-08-31T08:25:47Z","title":"Zero-Shot Conditioning of Score-Based Diffusion Models by Neuro-Symbolic\n Constraints","summary":" Score-based diffusion models have emerged as effective approaches for both\nconditional and unconditional generation. Still, conditional generation is based\neither on specific training of a conditional model or on classifier guidance,\nwhich requires training a noise-dependent classifier, even when a classifier\nfor uncorrupted data is given. 
We propose a method that, given a pre-trained\nunconditional score-based generative model, samples from the conditional\ndistribution under arbitrary logical constraints, without requiring additional\ntraining. Unlike other zero-shot techniques, which rather aim at\ngenerating valid conditional samples, our method is designed to approximate\nthe true conditional distribution. Firstly, we show how to manipulate the\nlearned score in order to sample from an un-normalized distribution conditional\non a user-defined constraint. Then, we define a flexible and numerically stable\nneuro-symbolic framework for encoding soft logical constraints. Combining these\ntwo ingredients, we obtain a general, but approximate, conditional sampling\nalgorithm. We further develop effective heuristics aimed at improving the\napproximation. Finally, we show the effectiveness of our approach in\napproximating conditional distributions for various types of constraints and\ndata: tabular data, images, and time series.\n","authors":["Davide Scassola","Sebastiano Saccani","Ginevra Carbone","Luca Bortolussi"],"pdf_url":"https://arxiv.org/pdf/2308.16534v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11935v1","updated":"2024-09-18T12:50:28Z","published":"2024-09-18T12:50:28Z","title":"Reinforcement Learning with Lie Group Orientations for Robotics","summary":" Handling orientations of robots and objects is a crucial aspect of many\napplications. Yet, all too often, there is a lack of mathematical correctness\nwhen dealing with orientations, especially in learning pipelines involving, for\nexample, artificial neural networks. In this paper, we investigate\nreinforcement learning with orientations and propose a simple modification of\nthe network's input and output that adheres to the Lie group structure of\norientations. As a result, we obtain an easy and efficient implementation that\nis directly usable with existing learning libraries and achieves significantly\nbetter performance than other common orientation representations. We briefly\nintroduce Lie theory specifically for orientations in robotics to motivate and\noutline our approach. Subsequently, a thorough empirical evaluation of\ndifferent combinations of orientation representations for states and actions\ndemonstrates the superior performance of our proposed approach in different\nscenarios, including direct orientation control, end effector orientation\ncontrol, and pick-and-place tasks.\n","authors":["Martin Schuck","Jan Brüdigam","Sandra Hirche","Angela Schoellig"],"pdf_url":"https://arxiv.org/pdf/2409.11935v1.pdf","comment":"Submitted to ICRA 2025"},{"id":"http://arxiv.org/abs/2409.11933v1","updated":"2024-09-18T12:48:56Z","published":"2024-09-18T12:48:56Z","title":"Reinforcement Learning as an Improvement Heuristic for Real-World\n Production Scheduling","summary":" The integration of Reinforcement Learning (RL) with heuristic methods is an\nemerging trend for solving optimization problems, which leverages RL's ability\nto learn from the data generated during the search process. One promising\napproach is to train an RL agent as an improvement heuristic, starting with a\nsuboptimal solution that is iteratively improved by applying small changes. We\napply this approach to a real-world multiobjective production scheduling\nproblem. Our approach utilizes a network architecture that includes Transformer\nencoding to learn the relationships between jobs. 
Afterwards, a probability\nmatrix is generated from which pairs of jobs are sampled and then swapped to\nimprove the solution. We benchmarked our approach against other heuristics\nusing real data from our industry partner, demonstrating its superior\nperformance.\n","authors":["Arthur Müller","Lukas Vollenkemper"],"pdf_url":"https://arxiv.org/pdf/2409.11933v1.pdf","comment":"This paper was accepted at the ICMLA 2024"},{"id":"http://arxiv.org/abs/2409.11929v1","updated":"2024-09-18T12:41:56Z","published":"2024-09-18T12:41:56Z","title":"An Explainable Machine Learning Approach to Traffic Accident Fatality\n Prediction","summary":" Road traffic accidents (RTA) pose a significant public health threat\nworldwide, leading to considerable loss of life and economic burdens. This is\nparticularly acute in developing countries like Bangladesh. Building reliable\nmodels to forecast crash outcomes is crucial for implementing effective\npreventive measures. To aid in developing targeted safety interventions, this\nstudy presents a machine learning-based approach for classifying fatal and\nnon-fatal road accident outcomes using data from the Dhaka metropolitan traffic\ncrash database from 2017 to 2022. Our framework utilizes a range of machine\nlearning classification algorithms, comprising Logistic Regression, Support\nVector Machines, Naive Bayes, Random Forest, Decision Tree, Gradient Boosting,\nLightGBM, and Artificial Neural Network. We prioritize model interpretability\nby employing the SHAP (SHapley Additive exPlanations) method, which elucidates\nthe key factors influencing accident fatality. Our results demonstrate that\nLightGBM outperforms other models, achieving a ROC-AUC score of 0.72. Global,\nlocal, and feature dependency analyses are conducted to acquire deeper\ninsights into the behavior of the model. SHAP analysis reveals that casualty\nclass, time of accident, location, vehicle type, and road type play pivotal\nroles in determining fatality risk. These findings offer valuable insights for\npolicymakers and road safety practitioners in developing countries, enabling\nthe implementation of evidence-based strategies to reduce traffic crash\nfatalities.\n","authors":["Md. Asif Khan Rifat","Ahmedul Kabir","Armana Sabiha Huq"],"pdf_url":"https://arxiv.org/pdf/2409.11929v1.pdf","comment":"10 Pages, 6 figures, 2 tables, 28th International Conference on\n Knowledge-Based and Intelligent Information & Engineering Systems (KES 2024)"},{"id":"http://arxiv.org/abs/2408.11619v2","updated":"2024-09-18T12:39:36Z","published":"2024-08-21T13:46:58Z","title":"Data-driven Modeling of Combined Sewer Systems for Urban Sustainability:\n An Empirical Evaluation","summary":" Climate change poses complex challenges, with extreme weather events becoming\nincreasingly frequent and difficult to model. Examples include the dynamics of\nCombined Sewer Systems (CSS). During heavy rainfall, overburdened CSS\noverflow, discharging untreated wastewater into surface water bodies. Classical approaches\nto modeling the impact of extreme rainfall events rely on physical simulations,\nwhich are particularly challenging to create for large urban infrastructures.\nDeep Learning (DL) models offer a cost-effective alternative for modeling the\ncomplex dynamics of sewer systems. In this study, we present a comprehensive\nempirical evaluation of several state-of-the-art DL time series models for\npredicting sewer system dynamics in a large urban infrastructure, utilizing\nthree years of measurement data. 
We especially investigate the potential of DL\nmodels to maintain predictive precision during network outages by comparing\nglobal models, which have access to all variables within the sewer system, and\nlocal models, which are limited to data from a restricted set of local sensors.\nOur findings demonstrate that DL models can accurately predict the dynamics of\nsewer system load, even under network outage conditions. These results suggest\nthat DL models can effectively aid in balancing the load redistribution in CSS,\nthereby enhancing the sustainability and resilience of urban infrastructures.\n","authors":["Vipin Singh","Tianheng Ling","Teodor Chiaburu","Felix Biessmann"],"pdf_url":"https://arxiv.org/pdf/2408.11619v2.pdf","comment":"8 pages, 4 figures, accepted at 47th German Conference on Artificial\n Intelligence, Wuerzburg 2024"},{"id":"http://arxiv.org/abs/2409.11920v1","updated":"2024-09-18T12:32:39Z","published":"2024-09-18T12:32:39Z","title":"Generation of Complex 3D Human Motion by Temporal and Spatial\n Composition of Diffusion Models","summary":" In this paper, we address the challenge of generating realistic 3D human\nmotions for action classes that were never seen during the training phase. Our\napproach involves decomposing complex actions into simpler movements,\nspecifically those observed during training, by leveraging the knowledge of\nhuman motion contained in GPT models. These simpler movements are then\ncombined into a single, realistic animation using the properties of diffusion\nmodels. Our claim is that this decomposition and subsequent recombination of\nsimple movements can synthesize an animation that accurately represents the\ncomplex input action. This method operates during the inference phase and can\nbe integrated with any pre-trained diffusion model, enabling the synthesis of\nmotion classes not present in the training data. We evaluate our method by\ndividing two benchmark human motion datasets into basic and complex actions,\nand then comparing its performance against the state-of-the-art.\n","authors":["Lorenzo Mandelli","Stefano Berretti"],"pdf_url":"https://arxiv.org/pdf/2409.11920v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.11902v1","updated":"2024-09-18T11:57:05Z","published":"2024-09-18T11:57:05Z","title":"Less Memory Means smaller GPUs: Backpropagation with Compressed\n Activations","summary":" The ever-growing scale of deep neural networks (DNNs) has led to an equally\nrapid growth in computational resource requirements. Many recent architectures,\nmost prominently Large Language Models, have to be trained using supercomputers\nwith thousands of accelerators, such as GPUs or TPUs. Beyond the vast number\nof floating-point operations, the memory footprint of DNNs is also exploding. In\ncontrast, GPU architectures are notoriously short on memory. Even comparatively\nsmall architectures like some EfficientNet variants cannot be trained on a\nsingle consumer-grade GPU at reasonable mini-batch sizes. During training,\nintermediate input activations have to be stored until backpropagation for\ngradient calculation. These make up the vast majority of the memory footprint.\nIn this work, we therefore consider compressing activation maps for the backward\npass using pooling, which can reduce both the memory footprint and amount of\ndata movement. The forward computation remains uncompressed. We empirically\nshow convergence and study effects on feature detection using the example of the\ncommon vision architecture ResNet. 
With this approach, we are able to reduce the\npeak memory consumption by 29% at the cost of a longer training schedule, while\nmaintaining prediction accuracy compared to an uncompressed baseline.\n","authors":["Daniel Barley","Holger Fröning"],"pdf_url":"https://arxiv.org/pdf/2409.11902v1.pdf","comment":"Presented at ITEM workshop co-located with ECML PKDD 2024, Vilnius LT"},{"id":"http://arxiv.org/abs/2209.05732v7","updated":"2024-09-18T11:52:04Z","published":"2022-09-13T04:58:35Z","title":"Rényi Divergence Deep Mutual Learning","summary":" This paper revisits Deep Mutual Learning (DML), a simple yet effective\ncomputing paradigm. We propose using R\\'{e}nyi divergence instead of the KL\ndivergence, which is more flexible and tunable, to improve vanilla DML. This\nmodification is able to consistently improve performance over vanilla DML with\nlimited additional complexity. The convergence properties of the proposed\nparadigm are analyzed theoretically, and Stochastic Gradient Descent with a\nconstant learning rate is shown to converge with $\\mathcal{O}(1)$-bias in the\nworst-case scenario for nonconvex optimization tasks. That is, learning will\nreach nearby local optima but continue searching within a bounded scope, which\nmay help mitigate overfitting. Finally, our extensive empirical results\ndemonstrate the advantage of combining DML and R\\'{e}nyi divergence, leading to\nfurther improvement in model generalization.\n","authors":["Weipeng Huang","Junjie Tao","Changbo Deng","Ming Fan","Wenqiang Wan","Qi Xiong","Guangyuan Piao"],"pdf_url":"https://arxiv.org/pdf/2209.05732v7.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11899v1","updated":"2024-09-18T11:47:48Z","published":"2024-09-18T11:47:48Z","title":"Multi-Grid Graph Neural Networks with Self-Attention for Computational\n Mechanics","summary":" Advancements in finite element methods have become essential in various\ndisciplines, and in particular for Computational Fluid Dynamics (CFD), driving\nresearch efforts for improved precision and efficiency. While Convolutional\nNeural Networks (CNNs) have found success in CFD by mapping meshes into images,\nrecent attention has turned to leveraging Graph Neural Networks (GNNs) for\ndirect mesh processing. This paper introduces a novel model merging\nSelf-Attention with Message Passing in GNNs, achieving a 15\\% reduction in RMSE\non the well-known flow past a cylinder benchmark. Furthermore, a dynamic mesh\npruning technique based on Self-Attention is proposed, which leads to a robust\nGNN-based multigrid approach, also reducing RMSE by 15\\%. Additionally, a new\nself-supervised training method based on BERT is presented, resulting in a 25\\%\nRMSE reduction. The paper includes an ablation study, and the model outperforms\nstate-of-the-art models on several challenging datasets, promising advancements\nsimilar to those recently achieved in natural language and image processing.\nFinally, the paper introduces a dataset with meshes larger than existing ones\nby at least an order of magnitude. 
Code and datasets will be released at\nhttps://github.com/DonsetPG/multigrid-gnn.\n","authors":["Paul Garnier","Jonathan Viquerat","Elie Hachem"],"pdf_url":"https://arxiv.org/pdf/2409.11899v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05680v3","updated":"2024-09-18T11:43:43Z","published":"2024-02-08T13:58:16Z","title":"Interpretable classifiers for tabular data via discretization and\n feature selection","summary":" We introduce a method for computing immediately human-interpretable yet\naccurate classifiers from tabular data. The classifiers obtained are short\nBoolean formulas, computed via first discretizing the original data and then\nusing feature selection coupled with a very fast algorithm for producing the\nbest possible Boolean classifier for the setting. We demonstrate the approach\nvia 12 experiments, obtaining results with accuracies comparable to ones\nobtained via random forests, XGBoost, and existing results for the same\ndatasets in the literature. In most cases, the accuracy of our method is in\nfact similar to that of the reference methods, even though the main objective\nof our study is the immediate interpretability of our classifiers. We also\nprove a new result on the probability that the classifier we obtain from\nreal-life data corresponds to the ideally best classifier with respect to the\nbackground distribution the data comes from.\n","authors":["Reijo Jaakkola","Tomi Janhunen","Antti Kuusisto","Masood Feyzbakhsh Rankooh","Miikka Vilander"],"pdf_url":"https://arxiv.org/pdf/2402.05680v3.pdf","comment":"Preprint of a paper in DAO-XAI 2024 (Data meets Applied Ontologies in\n Explainable AI)"},{"id":"http://arxiv.org/abs/2409.11897v1","updated":"2024-09-18T11:43:07Z","published":"2024-09-18T11:43:07Z","title":"Secure Control Systems for Autonomous Quadrotors against Cyber-Attacks","summary":" The problem of safety for robotic systems has been extensively studied.\nHowever, little attention has been given to security issues for\nthree-dimensional systems, such as quadrotors. Malicious adversaries can\ncompromise robot sensors and communication networks, causing incidents,\nachieving illegal objectives, or even injuring people. This study first designs\nan intelligent control system for autonomous quadrotors. Then, it investigates\nthe problems of optimal false data injection attack scheduling and\ncountermeasure design for unmanned aerial vehicles. Using a state-of-the-art\ndeep learning-based approach, an optimal false data injection attack scheme is\nproposed to degrade a quadrotor's tracking performance with limited attack\nenergy. Subsequently, an optimal tracking control strategy is learned to\nmitigate attacks and recover the quadrotor's tracking performance. We base our\nwork on Agilicious, a state-of-the-art quadrotor recently deployed for\nautonomous settings. This paper is the first in the United Kingdom to deploy\nthis quadrotor and implement reinforcement learning on its platform. Therefore,\nto promote easy reproducibility with minimal engineering overhead, we further\nprovide (1) a comprehensive breakdown of this quadrotor, including software\nstacks and hardware alternatives; (2) a detailed reinforcement-learning\nframework to train autonomous controllers on Agilicious agents; and (3) a new\nopen-source environment that builds upon PyFlyt for future reinforcement\nlearning research on Agilicious platforms. 
Both simulated and real-world\nexperiments are conducted in Section 5.2 to show the effectiveness of the proposed\nframeworks.\n","authors":["Samuel Belkadi"],"pdf_url":"https://arxiv.org/pdf/2409.11897v1.pdf","comment":"The paper is based on an undergraduate thesis and is not intended for\n publication in a journal"},{"id":"http://arxiv.org/abs/2409.11862v1","updated":"2024-09-18T10:34:48Z","published":"2024-09-18T10:34:48Z","title":"Location based Probabilistic Load Forecasting of EV Charging Sites: Deep\n Transfer Learning with Multi-Quantile Temporal Convolutional Network","summary":" Electrification of vehicles is a potential way of reducing fossil fuel usage\nand thus lessening environmental pollution. Electric Vehicles (EVs) of various\ntypes for different transport modes (including air, water, and land) are\nevolving. Moreover, different EV user groups (commuters, commercial or domestic\nusers, drivers) may use different charging infrastructures (public, private,\nhome, and workplace) at various times. Therefore, usage patterns and energy\ndemand are very stochastic. Characterizing and forecasting the charging demand\nof these diverse EV usage profiles is essential in preventing power outages.\nPreviously developed data-driven load models are limited to specific use cases\nand locations. None of these models is simultaneously adaptive enough to\ntransfer day-ahead forecasting knowledge among EV charging sites in diverse\nlocations, trainable with limited data, and cost-effective. This article presents\nlocation-based load forecasting of EV charging sites using a deep\nMulti-Quantile Temporal Convolutional Network (MQ-TCN) to overcome the\nlimitations of earlier models. We conducted our experiments on data from four\ncharging sites, namely Caltech, JPL, Office-1, and NREL, which have diverse EV\nuser types like students, full-time and part-time employees, random visitors,\netc. With a Prediction Interval Coverage Probability (PICP) score of 93.62\\%,\nour proposed deep MQ-TCN model exhibited a remarkable 28.93\\% improvement over\nthe XGBoost model for day-ahead load forecasting at the JPL charging site. By\ntransferring knowledge with the inductive Transfer Learning (TL) approach, the\nMQ-TCN model achieved a 96.88\\% PICP score for the load forecasting task at the\nNREL site using only two weeks of data.\n","authors":["Mohammad Wazed Ali","Asif bin Mustafa","Md. Aukerul Moin Shuvo","Bernhard Sick"],"pdf_url":"https://arxiv.org/pdf/2409.11862v1.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2409.11859v1","updated":"2024-09-18T10:28:28Z","published":"2024-09-18T10:28:28Z","title":"Tight and Efficient Upper Bound on Spectral Norm of Convolutional Layers","summary":" Controlling the spectral norm of the Jacobian matrix, which is related to the\nconvolution operation, has been shown to improve generalization, training\nstability and robustness in CNNs. Existing methods for computing the norm\neither tend to overestimate it or their performance may deteriorate quickly\nwith increasing input and kernel sizes. In this paper, we demonstrate that\nthe tensor version of the spectral norm of a four-dimensional convolution\nkernel, up to a constant factor, serves as an upper bound for the spectral norm\nof the Jacobian matrix associated with the convolution operation. This new\nupper bound is independent of the input image resolution, differentiable and\ncan be efficiently calculated during training. 
Through experiments, we\ndemonstrate how this new bound can be used to improve the performance of\nconvolutional architectures.\n","authors":["Ekaterina Grishina","Mikhail Gorbunov","Maxim Rakhuba"],"pdf_url":"https://arxiv.org/pdf/2409.11859v1.pdf","comment":"ECCV 2024"},{"id":"http://arxiv.org/abs/2409.11856v1","updated":"2024-09-18T10:24:42Z","published":"2024-09-18T10:24:42Z","title":"Edge-Based Graph Component Pooling","summary":" Graph-structured data naturally occurs in many research fields, such as\nchemistry and sociology. The relational information contained therein can be\nleveraged to statistically model graph properties through geometrical deep\nlearning. Graph neural networks employ techniques, such as message-passing\nlayers, to propagate local features through a graph. However, message-passing\nlayers can be computationally expensive when dealing with large and sparse\ngraphs. Graph pooling operators offer the possibility of removing or merging\nnodes in such graphs, thus lowering computational costs. However, pooling\noperators that remove nodes cause data loss, and pooling operators that merge\nnodes are often computationally expensive. We propose a pooling operator that\nmerges nodes without causing data loss and is conceptually simple and\ncomputationally inexpensive. We empirically demonstrate that the proposed\npooling operator performs statistically significantly better than edge pool on\nfour popular benchmark datasets while reducing time complexity and the number\nof trainable parameters by 70.6% on average. Compared to another maximally\npowerful method named Graph Isomorphism Network, we show that we outperform it\non two popular benchmark datasets while reducing the number of learnable\nparameters on average by 60.9%.\n","authors":["T. Snelleman","B. M. Renting","H. H. Hoos","J. N. van Rijn"],"pdf_url":"https://arxiv.org/pdf/2409.11856v1.pdf","comment":"15 pages, presented at 21st International Workshop on Mining and\n Learning with Graphs, AstraZenica Bio & Healthcare award Paper, ECML PKDD\n 2024 Vilnius"},{"id":"http://arxiv.org/abs/2405.13031v2","updated":"2024-09-18T10:06:58Z","published":"2024-05-16T10:45:43Z","title":"A Robust Autoencoder Ensemble-Based Approach for Anomaly Detection in\n Text","summary":" Anomaly detection (AD) is a fast-growing and popular domain among established\napplications like vision and time series. We observe a rich literature for\nthese applications, but anomaly detection in text is only starting to blossom.\nRecently, self-supervised methods with a self-attention mechanism have been the\nmost popular choice. While recent works have proposed a working ground for\nbuilding and benchmarking state-of-the-art approaches, we propose two principal\ncontributions in this paper: contextual anomaly contamination and a novel\nensemble-based approach. Our method, Textual Anomaly Contamination (TAC),\nallows contaminating inlier classes with either independent or contextual\nanomalies. In the literature, it appears that this distinction is not\nmade. For finding contextual anomalies, we propose RoSAE, a Robust\nSubspace Local Recovery Autoencoder Ensemble. All autoencoders of the ensemble\npresent a different latent representation through local manifold learning.\nBenchmarks show that our approach outperforms recent works on both independent\nand contextual anomalies, while being more robust. 
We also provide an eight-dataset\ncomparison instead of relying only on the Reuters and 20 Newsgroups corpora.\n","authors":["Jeremie Pantin","Christophe Marsala"],"pdf_url":"https://arxiv.org/pdf/2405.13031v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.12907v2","updated":"2024-09-18T10:04:57Z","published":"2024-06-12T13:30:48Z","title":"Reconciling Kaplan and Chinchilla Scaling Laws","summary":" Kaplan et al. [2020] (`Kaplan') and Hoffmann et al. [2022] (`Chinchilla')\nstudied the scaling behavior of transformers trained on next-token language\nprediction. These studies produced different estimates for how the number of\nparameters ($N$) and training tokens ($D$) should be set to achieve the lowest\npossible loss for a given compute budget ($C$). Kaplan: $N_\text{optimal}\n\propto C^{0.73}$, Chinchilla: $N_\text{optimal} \propto C^{0.50}$. This paper\nfinds that much of this discrepancy can be attributed to Kaplan counting\nnon-embedding rather than total parameters, combined with their analysis being\nperformed at small scale. Simulating the Chinchilla study under these\nconditions produces biased scaling coefficients close to Kaplan's. Hence, this\npaper reaffirms Chinchilla's scaling coefficients by explaining the primary\ncause of Kaplan's original overestimation. As a second contribution, the paper\nexplains differences in the reported relationships between loss and compute.\nThese findings lead us to recommend that future scaling studies use total\nparameters and compute.\n","authors":["Tim Pearce","Jinyeop Song"],"pdf_url":"https://arxiv.org/pdf/2406.12907v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.08069v3","updated":"2024-09-18T10:04:56Z","published":"2024-06-12T10:39:31Z","title":"Explore-Go: Leveraging Exploration for Generalisation in Deep\n Reinforcement Learning","summary":" One of the remaining challenges in reinforcement learning is to develop\nagents that can generalise to novel scenarios they might encounter once\ndeployed. This challenge is often framed in a multi-task setting where agents\ntrain on a fixed set of tasks and have to generalise to new tasks. Recent work\nhas shown that in this setting increased exploration during training can be\nleveraged to increase the generalisation performance of the agent. This makes\nsense when the states encountered during testing can actually be explored\nduring training. In this paper, we provide intuition for why exploration can\nalso benefit generalisation to states that cannot be explicitly encountered\nduring training. Additionally, we propose a novel method, Explore-Go, that\nexploits this intuition by increasing the number of states on which the agent\ntrains. Explore-Go effectively increases the starting state distribution of the\nagent and as a result can be used in conjunction with most existing on-policy\nor off-policy reinforcement learning algorithms. We show empirically that our\nmethod can increase generalisation performance in an illustrative environment\nand on the Procgen benchmark.\n","authors":["Max Weltevrede","Felix Kaubek","Matthijs T. J. 
Spaan","Wendelin Böhmer"],"pdf_url":"https://arxiv.org/pdf/2406.08069v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11847v1","updated":"2024-09-18T10:01:37Z","published":"2024-09-18T10:01:37Z","title":"An efficient wavelet-based physics-informed neural networks for\n singularly perturbed problems","summary":" Physics-informed neural networks (PINNs) are a class of deep learning models\nthat utilize physics as differential equations to address complex problems,\nincluding ones that may involve limited data availability. However, tackling\nsolutions of differential equations with oscillations or singular perturbations\nand shock-like structures becomes challenging for PINNs. Considering these\nchallenges, we designed an efficient wavelet-based PINNs (W-PINNs) model to\nsolve singularly perturbed differential equations. Here, we represent the\nsolution in wavelet space using a family of smooth-compactly supported\nwavelets. This framework represents the solution of a differential equation\nwith significantly fewer degrees of freedom while still retaining in capturing,\nidentifying, and analyzing the local structure of complex physical phenomena.\nThe architecture allows the training process to search for a solution within\nwavelet space, making the process faster and more accurate. The proposed model\ndoes not rely on automatic differentiations for derivatives involved in\ndifferential equations and does not require any prior information regarding the\nbehavior of the solution, such as the location of abrupt features. Thus,\nthrough a strategic fusion of wavelets with PINNs, W-PINNs excel at capturing\nlocalized nonlinear information, making them well-suited for problems showing\nabrupt behavior in certain regions, such as singularly perturbed problems. The\nefficiency and accuracy of the proposed neural network model are demonstrated\nin various test problems, i.e., highly singularly perturbed nonlinear\ndifferential equations, the FitzHugh-Nagumo (FHN), and Predator-prey\ninteraction models. The proposed design model exhibits impressive comparisons\nwith traditional PINNs and the recently developed wavelet-based PINNs, which\nuse wavelets as an activation function for solving nonlinear differential\nequations.\n","authors":["Himanshu Pandey","Anshima Singh","Ratikanta Behera"],"pdf_url":"https://arxiv.org/pdf/2409.11847v1.pdf","comment":"17 pages, 12 figures"},{"id":"http://arxiv.org/abs/2409.11843v1","updated":"2024-09-18T09:53:13Z","published":"2024-09-18T09:53:13Z","title":"Graph Neural Network-State Predictive Information Bottleneck (GNN-SPIB)\n approach for learning molecular thermodynamics and kinetics","summary":" Molecular dynamics simulations offer detailed insights into atomic motions\nbut face timescale limitations. Enhanced sampling methods have addressed these\nchallenges but even with machine learning, they often rely on pre-selected\nexpert-based features. In this work, we present the Graph Neural Network-State\nPredictive Information Bottleneck (GNN-SPIB) framework, which combines graph\nneural networks and the State Predictive Information Bottleneck to\nautomatically learn low-dimensional representations directly from atomic\ncoordinates. Tested on three benchmark systems, our approach predicts essential\nstructural, thermodynamic and kinetic information for slow processes,\ndemonstrating robustness across diverse systems. 
The method shows promise for\ncomplex systems, enabling effective enhanced sampling without requiring\npre-defined reaction coordinates or input features.\n","authors":["Ziyue Zou","Dedi Wang","Pratyush Tiwary"],"pdf_url":"https://arxiv.org/pdf/2409.11843v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11831v1","updated":"2024-09-18T09:30:03Z","published":"2024-09-18T09:30:03Z","title":"RaggeDi: Diffusion-based State Estimation of Disordered Rags, Sheets,\n Towels and Blankets","summary":" Cloth state estimation is an important problem in robotics. It is essential\nfor the robot to know the accurate state in order to manipulate cloth and\nexecute tasks such as robotic dressing, stitching, and covering/uncovering\nhuman beings. However, estimating cloth state accurately remains challenging\ndue to its high flexibility and self-occlusion. This paper proposes a diffusion\nmodel-based pipeline that formulates cloth state estimation as an image\ngeneration problem by representing the cloth state as an RGB image that\ndescribes the point-wise translation (translation map) between a pre-defined\nflattened mesh and the deformed mesh in a canonical space. Then we train a\nconditional diffusion-based image generation model to predict the translation\nmap based on an observation. Experiments are conducted in both simulation and\nthe real world to validate the performance of our method. Results indicate that\nour method outperforms two recent methods in both accuracy and speed.\n","authors":["Jikai Ye","Wanze Li","Shiraz Khan","Gregory S. Chirikjian"],"pdf_url":"https://arxiv.org/pdf/2409.11831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.05477v3","updated":"2024-09-18T09:15:10Z","published":"2024-09-09T10:11:25Z","title":"Retrofitting Temporal Graph Neural Networks with Transformer","summary":" Temporal graph neural networks (TGNNs) outperform regular GNNs by\nincorporating time information into graph-based operations. However, TGNNs\nadopt specialized models (e.g., TGN, TGAT, and APAN) and require tailored\ntraining frameworks (e.g., TGL and ETC). In this paper, we propose TF-TGN,\nwhich uses a Transformer decoder as the backbone model for TGNNs to leverage\nthe Transformer codebase for efficient training. In particular, the Transformer\nhas achieved tremendous success in language modeling, and thus the community\ndeveloped high-performance kernels (e.g., flash-attention and memory-efficient\nattention) and efficient distributed training schemes (e.g., PyTorch FSDP,\nDeepSpeed, and Megatron-LM). We observe that TGNNs resemble language modeling,\ni.e., the message aggregation operation between chronologically occurring nodes\nand their temporal neighbors in TGNNs can be structured as sequence modeling.\nBesides this similarity, we also incorporate a series of algorithm designs\nincluding suffix infilling, temporal graph attention with self-loop, and causal\nmasking self-attention to make TF-TGN work. During training, existing systems\nare slow in transforming the graph topology and conducting graph sampling. As\nsuch, we propose methods to parallelize the CSR format conversion and graph\nsampling. We also adapt the Transformer codebase to train TF-TGN efficiently\nwith multiple GPUs. We experiment with 9 graphs and compare with 2\nstate-of-the-art TGNN training frameworks. The results show that TF-TGN can\naccelerate training by over 2.20$\times$ while providing comparable or even\nsuperior accuracy to existing SOTA TGNNs. 
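One of the TF-TGN ingredients listed above, causal masking self-attention, is easy to show in isolation: each position in the chronological neighbor sequence may only attend to earlier positions. A generic PyTorch sketch (sizes are arbitrary; this is not the paper's implementation):

import torch

T = 5                                   # sequence length (chronological events)
scores = torch.randn(T, T)              # raw attention scores
causal = torch.triu(torch.ones(T, T, dtype=torch.bool), diagonal=1)
attn = scores.masked_fill(causal, float("-inf")).softmax(dim=-1)
print(attn)  # row i has zero weight on positions j > i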
TF-TGN is available at https://github.com/qianghuangwhu/TF-TGN.\n","authors":["Qiang Huang","Xiao Yan","Xin Wang","Susie Xi Rao","Zhichao Han","Fangcheng Fu","Wentao Zhang","Jiawei Jiang"],"pdf_url":"https://arxiv.org/pdf/2409.05477v3.pdf","comment":"conference Under review"},{"id":"http://arxiv.org/abs/2409.11820v1","updated":"2024-09-18T09:12:40Z","published":"2024-09-18T09:12:40Z","title":"Optimizing Job Shop Scheduling in the Furniture Industry: A\n Reinforcement Learning Approach Considering Machine Setup, Batch Variability,\n and Intralogistics","summary":" This paper explores the potential application of Deep Reinforcement Learning\nin the furniture industry. To offer a broad product portfolio, most furniture\nmanufacturers are organized as a job shop, which ultimately results in the Job\nShop Scheduling Problem (JSSP). The JSSP is addressed with a focus on extending\ntraditional models to better represent the complexities of real-world\nproduction environments. Existing approaches frequently fail to consider\ncritical factors such as machine setup times or varying batch sizes. A concept\nfor a model is proposed that provides a higher level of information detail to\nenhance scheduling accuracy and efficiency. The concept introduces the\nintegration of DRL for production planning, particularly suited to batch\nproduction industries such as the furniture industry. The model extends\ntraditional approaches to JSSPs by including job volumes, buffer management,\ntransportation times, and machine setup times. This enables more precise\nforecasting and analysis of production flows and processes, accommodating the\nvariability and complexity inherent in real-world manufacturing processes. The\nRL agent learns to optimize scheduling decisions. It operates within a discrete\naction space, making decisions based on detailed observations. A reward\nfunction guides the agent's decision-making process, thereby promoting\nefficient scheduling and meeting production deadlines. Two integration\nstrategies for implementing the RL agent are discussed: episodic planning,\nwhich is suitable for low-automation environments, and continuous planning,\nwhich is ideal for highly automated plants. While episodic planning can be\nemployed as a standalone solution, the continuous planning approach\nnecessitates the integration of the agent with ERP and Manufacturing Execution\nSystems. This integration enables real-time adjustments to production schedules\nbased on dynamic changes.\n","authors":["Malte Schneevogt","Karsten Binninger","Noah Klarmann"],"pdf_url":"https://arxiv.org/pdf/2409.11820v1.pdf","comment":"18 pages, 8 pages"},{"id":"http://arxiv.org/abs/2409.11808v1","updated":"2024-09-18T08:52:30Z","published":"2024-09-18T08:52:30Z","title":"Accelerating the Training and Improving the Reliability of\n Machine-Learned Interatomic Potentials for Strongly Anharmonic Materials\n through Active Learning","summary":" Molecular dynamics (MD) employing machine-learned interatomic potentials\n(MLIPs) serve as an efficient, urgently needed complement to ab initio\nmolecular dynamics (aiMD). By training these potentials on data generated from\nab initio methods, their averaged predictions can exhibit comparable\nperformance to ab initio methods at a fraction of the cost. 
However,\ninsufficient training sets might lead to an improper description of the\ndynamics in strongly anharmonic materials, because critical effects might be\noverlooked, incorrectly captured, or even hallucinated by the MLIP when they\nare not actually present. In this work, we show that an active learning scheme\nthat combines MD with MLIPs (MLIP-MD) and uncertainty estimates can avoid such\nproblematic predictions. In short, efficient MLIP-MD is used to explore\nconfiguration space quickly, whereby an acquisition function based on\nuncertainty estimates and on energetic viability is employed to maximize the\nvalue of the newly generated data and to focus on the most unfamiliar but\nreasonably accessible regions of phase space. To verify our methodology, we\nscreen over 112 materials and identify 10 examples experiencing the\naforementioned problems. Using CuI and AgGaSe$_2$ as archetypes for these\nproblematic materials, we discuss the physical implications for strongly\nanharmonic effects and demonstrate how the developed active learning scheme can\naddress these issues.\n","authors":["Kisung Kang","Thomas A. R. Purcell","Christian Carbogno","Matthias Scheffler"],"pdf_url":"https://arxiv.org/pdf/2409.11808v1.pdf","comment":"15 pages, 13 figures"},{"id":"http://arxiv.org/abs/2409.11807v1","updated":"2024-09-18T08:48:54Z","published":"2024-09-18T08:48:54Z","title":"Constraint Guided AutoEncoders for Joint Optimization of Condition\n Indicator Estimation and Anomaly Detection in Machine Condition Monitoring","summary":" The main goal of machine condition monitoring is, as the name implies, to\nmonitor the condition of industrial applications. The objective of this\nmonitoring can be mainly split into two problems: a diagnostic problem, where\nnormal data should be distinguished from anomalous data, also called Anomaly\nDetection (AD), and a prognostic problem, where the aim is to predict the\nevolution of a Condition Indicator (CI) that reflects the condition of an asset\nthroughout its lifetime. When considering machine condition monitoring, this CI\nis expected to show monotonic behavior, as the condition of a machine gradually\ndegrades over time. This work proposes an extension to Constraint Guided\nAutoEncoders (CGAE), a robust AD method, that enables building a single model\nthat can be used for both AD and CI estimation. For improved CI estimation, the\nextension incorporates a constraint that forces the model to produce\nmonotonically increasing CI predictions over time. Experimental results\nindicate that the proposed algorithm performs similarly to, or slightly better\nthan, CGAE with regard to AD, while improving the monotonic behavior of the\nCI.\n","authors":["Maarten Meire","Quinten Van Baelen","Ted Ooijevaar","Peter Karsmakers"],"pdf_url":"https://arxiv.org/pdf/2409.11807v1.pdf","comment":"32 pages, 7 figures, 4 tables"},{"id":"http://arxiv.org/abs/2210.02215v3","updated":"2024-09-18T08:48:11Z","published":"2022-10-05T12:55:53Z","title":"On the Statistical Complexity of Estimation and Testing under Privacy\n Constraints","summary":" The challenge of producing accurate statistics while respecting the privacy\nof the individuals in a sample is an important area of research. We study\nminimax lower bounds for classes of differentially private estimators. 
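The MLIP-MD abstract above describes an acquisition function that favors uncertain yet energetically viable configurations; the exact functional form is not given there, so the one below is purely an assumed stand-in to make the idea concrete.

import numpy as np

def acquisition(uncertainty, energy, e_max):
    # score = uncertainty, but only for structures below an energy-viability cutoff
    viable = energy <= e_max
    return np.where(viable, uncertainty, -np.inf)

unc = np.array([0.10, 0.90, 0.55])      # model uncertainty per candidate
en = np.array([0.20, 0.40, 2.00])       # energies (made-up units)
print("pick candidate:", int(np.argmax(acquisition(unc, en, e_max=1.0))))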
In\nparticular, we show how to characterize the power of a statistical test under\ndifferential privacy in a plug-and-play fashion by solving an appropriate\ntransport problem. With specific coupling constructions, this observation\nallows us to derive Le Cam-type and Fano-type inequalities not only for regular\ndefinitions of differential privacy but also for those based on Rényi\ndivergence. We then proceed to illustrate our results on three simple, fully\nworked-out examples. In particular, we show that the problem class has a major\nimpact on the provable degradation of utility due to privacy. In certain\nscenarios, we show that maintaining privacy results in a noticeable reduction\nin performance only when the level of privacy protection is very high.\nConversely, for other problems, even a modest level of privacy protection can\nlead to a significant decrease in performance. Finally, we demonstrate that the\nDP-SGLD algorithm, a private convex solver, can be employed for maximum\nlikelihood estimation with a high degree of confidence, as it provides\nnear-optimal results with respect to both the size of the sample and the level\nof privacy protection. This algorithm is applicable to a broad range of\nparametric estimation procedures, including exponential families.\n","authors":["Clément Lalanne","Aurélien Garivier","Rémi Gribonval"],"pdf_url":"https://arxiv.org/pdf/2210.02215v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11798v1","updated":"2024-09-18T08:30:20Z","published":"2024-09-18T08:30:20Z","title":"The Factuality of Large Language Models in the Legal Domain","summary":" This paper investigates the factuality of large language models (LLMs) as\nknowledge bases in the legal domain, in a realistic usage scenario: we allow\nfor acceptable variations in the answer, and let the model abstain from\nanswering when uncertain. First, we design a dataset of diverse factual\nquestions about case law and legislation. We then use the dataset to evaluate\nseveral LLMs under different evaluation methods, including exact, alias, and\nfuzzy matching. Our results show that the performance improves significantly\nunder the alias and fuzzy matching methods. Further, we explore the impact of\nabstaining and in-context examples, finding that both strategies enhance\nprecision. Finally, we demonstrate that additional pre-training on legal\ndocuments, as seen with SaulLM, further improves factual precision from 63% to\n81%.\n","authors":["Rajaa El Hamdani","Thomas Bonald","Fragkiskos Malliaros","Nils Holzenberger","Fabian Suchanek"],"pdf_url":"https://arxiv.org/pdf/2409.11798v1.pdf","comment":"CIKM 2024, short paper"},{"id":"http://arxiv.org/abs/2406.15540v2","updated":"2024-09-18T08:21:29Z","published":"2024-06-21T17:39:57Z","title":"Specify What? Enhancing Neural Specification Synthesis by Symbolic\n Methods","summary":" We investigate how combinations of Large Language Models (LLMs) and symbolic\nanalyses can be used to synthesise specifications of C programs. The LLM\nprompts are augmented with outputs from two formal methods tools in the Frama-C\necosystem, Pathcrawler and EVA, to produce C program annotations in the\nspecification language ACSL. We demonstrate how the addition of symbolic\nanalysis to the workflow impacts the quality of annotations: information about\ninput/output examples from Pathcrawler produces more context-aware annotations,\nwhile the inclusion of EVA reports yields annotations more attuned to runtime\nerrors. 
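The alias/fuzzy matching evaluation in the legal-factuality abstract above can be approximated in a few lines: a prediction counts as correct if its normalized string similarity to the gold answer clears a threshold. The threshold and normalization below are assumptions, not the paper's protocol.

from difflib import SequenceMatcher

def fuzzy_match(pred, gold, threshold=0.8):
    # normalized similarity between lowercased, stripped strings
    ratio = SequenceMatcher(None, pred.lower().strip(), gold.lower().strip()).ratio()
    return ratio >= threshold

print(fuzzy_match("Court of Justice of the EU",
                  "Court of Justice of the European Union"))  # True at 0.8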
In addition, we show that the method infers the program's intent rather\nthan its behaviour, by generating specifications for buggy programs and\nobserving the robustness of the result against bugs.\n","authors":["George Granberry","Wolfgang Ahrendt","Moa Johansson"],"pdf_url":"https://arxiv.org/pdf/2406.15540v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11772v1","updated":"2024-09-18T07:52:33Z","published":"2024-09-18T07:52:33Z","title":"Symmetry-Based Structured Matrices for Efficient Approximately\n Equivariant Networks","summary":" There has been much recent interest in designing symmetry-aware neural\nnetworks (NNs) exhibiting relaxed equivariance. Such NNs aim to interpolate\nbetween being exactly equivariant and being fully flexible, affording\nconsistent performance benefits. In a separate line of work, certain structured\nparameter matrices -- those with displacement structure, characterized by low\ndisplacement rank (LDR) -- have been used to design small-footprint NNs.\nDisplacement structure enables fast function and gradient evaluation, but\npermits accurate approximation via compression primarily for classical\nconvolutional neural networks (CNNs). In this work, we propose a general\nframework -- based on a novel construction of symmetry-based structured\nmatrices -- to build approximately equivariant NNs with significantly reduced\nparameter counts. Our framework integrates the two aforementioned lines of work\nvia the use of so-called Group Matrices (GMs), a forgotten precursor to the\nmodern notion of regular representations of finite groups. GMs allow the design\nof structured matrices -- resembling LDR matrices -- which generalize the\nlinear operations of a classical CNN from cyclic groups to general finite\ngroups and their homogeneous spaces. We show that GMs can be employed to extend\nall the elementary operations of CNNs to general discrete groups. Further, the\ntheory of structured matrices based on GMs provides a generalization of LDR\ntheory, which is focussed on matrices with cyclic structure, providing a tool\nfor implementing approximate equivariance for discrete groups. We test GM-based\narchitectures on a variety of tasks in the presence of relaxed symmetry. We\nreport that our framework consistently performs competitively compared to\napproximately equivariant NNs, and other structured matrix-based compression\nframeworks, sometimes with one to two orders of magnitude fewer parameters.\n","authors":["Ashwin Samudre","Mircea Petrache","Brian D. Nord","Shubhendu Trivedi"],"pdf_url":"https://arxiv.org/pdf/2409.11772v1.pdf","comment":"20 pages"},{"id":"http://arxiv.org/abs/2401.12708v2","updated":"2024-09-18T07:48:33Z","published":"2024-01-23T12:15:47Z","title":"Deep Neural Network Benchmarks for Selective Classification","summary":" With the increasing deployment of machine learning models in many socially\nsensitive tasks, there is a growing demand for reliable and trustworthy\npredictions. One way to accomplish these requirements is to allow a model to\nabstain from making a prediction when there is a high risk of making an error.\nThis requires adding a selection mechanism to the model, which selects those\nexamples for which the model will provide a prediction. The selective\nclassification framework aims to design a mechanism that balances the fraction\nof rejected predictions (i.e., the proportion of examples for which the model\ndoes not make a prediction) versus the improvement in predictive performance on\nthe selected predictions. 
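For the cyclic group, the Group Matrices (GMs) mentioned in the structured-matrices abstract above reduce to circulant matrices, and multiplying by one is exactly circular convolution, i.e. the linear operation of a classical CNN layer. A small numpy check of that special case (the general finite-group construction is more involved):

import numpy as np

def cyclic_group_matrix(w):
    # entry (i, j) holds w[(i - j) mod n]: a circulant matrix
    n = len(w)
    i, j = np.indices((n, n))
    return w[(i - j) % n]

w = np.array([1.0, 2.0, 0.0, -1.0])
x = np.random.randn(4)
conv = np.real(np.fft.ifft(np.fft.fft(w) * np.fft.fft(x)))  # circular convolution
print(np.allclose(cyclic_group_matrix(w) @ x, conv))  # True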
Multiple selective classification frameworks exist,\nmost of which rely on deep neural network architectures. However, the empirical\nevaluation of the existing approaches is still limited to partial comparisons\namong methods and settings, providing practitioners with little insight into\ntheir relative merits. We fill this gap by benchmarking 18 baselines on a\ndiverse set of 44 datasets that includes both image and tabular data and mixes\nbinary and multiclass tasks. We evaluate these approaches\nusing several criteria, including selective error rate, empirical coverage,\ndistribution of rejected instances' classes, and performance on\nout-of-distribution instances. The results indicate that there is not a single\nclear winner among the surveyed baselines, and the best method depends on the\nusers' objectives.\n","authors":["Andrea Pugnana","Lorenzo Perini","Jesse Davis","Salvatore Ruggieri"],"pdf_url":"https://arxiv.org/pdf/2401.12708v2.pdf","comment":"Published in The Journal of Data-centric Machine Learning Research\n (DMLR), Vol 1, (17):1-58 (2024)"},{"id":"http://arxiv.org/abs/2409.06349v2","updated":"2024-09-18T07:44:13Z","published":"2024-09-10T09:07:47Z","title":"Improving Conditional Level Generation using Automated Validation in\n Match-3 Games","summary":" Generative models for level generation have shown great potential in game\nproduction. However, they often provide limited control over the generation,\nand the validity of the generated levels is unreliable. Despite this fact, only\na few approaches that learn from existing data provide the users with ways of\ncontrolling the generation, simultaneously addressing the generation of\nunsolvable levels. This paper proposes Avalon,\na novel method to improve models that learn from existing level designs using\ndifficulty statistics extracted from gameplay. In particular, we use a\nconditional variational autoencoder to generate layouts for match-3 levels,\nconditioning the model on pre-collected statistics such as game mechanics like\ndifficulty and relevant visual features like size and symmetry. Our method is\ngeneral enough that multiple approaches could potentially be used to generate\nthese statistics. We quantitatively evaluate our approach by comparing it to an\nablated model without difficulty conditioning. Additionally, we analyze both\nquantitatively and qualitatively whether the style of the dataset is preserved\nin the generated levels. Our approach generates more valid levels than the same\nmethod without difficulty conditioning.\n","authors":["Monica Villanueva Aylagas","Joakim Bergdahl","Jonas Gillberg","Alessandro Sestini","Theodor Tolstoy","Linus Gisslén"],"pdf_url":"https://arxiv.org/pdf/2409.06349v2.pdf","comment":"10 pages, 5 figures, 2 tables"},{"id":"http://arxiv.org/abs/2409.04464v2","updated":"2024-09-18T07:43:12Z","published":"2024-09-03T07:25:01Z","title":"Leveraging Large Language Models for Solving Rare MIP Challenges","summary":" Mixed Integer Programming (MIP) has been extensively applied in areas\nrequiring mathematical solvers to address complex instances within tight time\nconstraints. However, as the problem scale increases, the complexity of model\nformulation and finding feasible solutions escalates significantly. 
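Two of the evaluation criteria in the selective-classification benchmark above, empirical coverage and selective error rate, are simple to compute once a confidence threshold is fixed. A generic sketch with invented numbers:

import numpy as np

def selective_metrics(correct, confidence, tau):
    selected = confidence >= tau             # predictions the model does not reject
    coverage = selected.mean()
    sel_error = 1.0 - correct[selected].mean() if selected.any() else 0.0
    return coverage, sel_error

correct = np.array([1, 0, 1, 0, 1], dtype=float)
conf = np.array([0.90, 0.55, 0.80, 0.60, 0.70])
print(selective_metrics(correct, conf, tau=0.6))  # (0.8, 0.25)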
In\ncontrast, the model-building cost for end-to-end models, such as large language\nmodels (LLMs), remains largely unaffected by problem scale due to their pattern\nrecognition capabilities. While LLMs like GPT-4 can handle some traditional\nmedium-scale MIP problems without fine-tuning, they struggle with uncommon\nor highly specialized MIP scenarios. Fine-tuning LLMs can yield some feasible\nsolutions for medium-scale MIP instances, but these models typically fail to\nexplore diverse solutions when constrained by a low and constant temperature,\nlimiting their performance. In this paper, we propose and evaluate a\nrecursively dynamic temperature method integrated with a chain-of-thought\napproach. Our findings show that starting with a high temperature and gradually\nlowering it leads to better feasible solutions compared to other dynamic\ntemperature strategies. Additionally, by comparing results generated by the LLM\nwith those from Gurobi, we demonstrate that the LLM can produce solutions that\ncomplement traditional solvers by accelerating the pruning process and\nimproving overall efficiency.\n","authors":["Teng Wang","Wing-Yin Yu","Ruifeng She","Wenhan Yang","Taijie Chen","Jianping Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.04464v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10964v2","updated":"2024-09-18T07:37:31Z","published":"2024-09-17T08:01:58Z","title":"Active learning for energy-based antibody optimization and enhanced\n screening","summary":" Accurate prediction and optimization of protein-protein binding affinity are\ncrucial for therapeutic antibody development. Although machine learning-based\n$\Delta\Delta G$ prediction methods are suitable for large-scale mutant\nscreening, they struggle to predict the effects of multiple mutations for\ntargets without existing binders. Energy function-based methods, though more\naccurate, are time-consuming and not ideal for large-scale screening. To\naddress this, we propose an active learning workflow that efficiently trains a\ndeep learning model to learn energy functions for specific targets, combining\nthe advantages of both approaches. Our method integrates the RDE-Network deep\nlearning model with Rosetta's energy function-based Flex ddG to efficiently\nexplore mutants. In a case study targeting HER2-binding Trastuzumab mutants,\nour approach significantly improved the screening performance over random\nselection and demonstrated the ability to identify mutants with better binding\nproperties without experimental $\Delta\Delta G$ data. This workflow advances\ncomputational antibody design by combining machine learning, physics-based\ncomputations, and active learning to achieve more efficient antibody\ndevelopment.\n","authors":["Kairi Furui","Masahito Ohue"],"pdf_url":"https://arxiv.org/pdf/2409.10964v2.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2409.11761v1","updated":"2024-09-18T07:36:25Z","published":"2024-09-18T07:36:25Z","title":"Consistent Estimation of a Class of Distances Between Covariance\n Matrices","summary":" This work considers the problem of estimating the distance between two\ncovariance matrices directly from the data. Particularly, we are interested in\nthe family of distances that can be expressed as sums of traces of functions\nthat are separately applied to each covariance matrix. 
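The MIP abstract above only states that temperature starts high and is gradually lowered; a plausible geometric schedule (all constants assumed, not the paper's "recursively dynamic" rule) looks like this:

def temperature(step, t0=1.2, t_min=0.2, decay=0.85):
    # start hot to diversify candidate solutions, cool down to sharpen them
    return max(t_min, t0 * decay**step)

for step in range(6):
    print(step, round(temperature(step), 3))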
This family of distances\nis particularly useful as it takes into consideration the fact that covariance\nmatrices lie in the Riemannian manifold of positive definite matrices, thereby\nincluding a variety of commonly used metrics, such as the Euclidean distance,\nJeffreys' divergence, and the log-Euclidean distance. Moreover, we conduct a\nstatistical analysis of the asymptotic behavior of this class of distance\nestimators. Specifically, we present a central limit theorem that\nestablishes the asymptotic Gaussianity of these estimators and provides\nclosed-form expressions for the corresponding means and variances. Empirical\nevaluations demonstrate the superiority of our proposed consistent estimator\nover conventional plug-in estimators in multivariate analytical contexts.\nAdditionally, the central limit theorem derived in this study provides a robust\nstatistical framework to assess the accuracy of these estimators.\n","authors":["Roberto Pereira","Xavier Mestre","Davig Gregoratti"],"pdf_url":"https://arxiv.org/pdf/2409.11761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10640v2","updated":"2024-09-18T07:35:46Z","published":"2024-09-16T18:15:28Z","title":"Exploring Fine-tuned Generative Models for Keyphrase Selection: A Case\n Study for Russian","summary":" Keyphrase selection plays a pivotal role within the domain of scholarly\ntexts, facilitating efficient information retrieval, summarization, and\nindexing. In this work, we explored how to apply fine-tuned generative\ntransformer-based models to the specific task of keyphrase selection within\nRussian scientific texts. We experimented with four distinct generative models\n-- ruT5, ruGPT, mT5, and mBART -- and evaluated their performance in both\nin-domain and cross-domain settings. The experiments were conducted on the\ntexts of Russian scientific abstracts from four domains: mathematics & computer\nscience, history, medicine, and linguistics. The use of generative models,\nnamely mBART, led to gains in in-domain performance (up to 4.9% in BERTScore,\n9.0% in ROUGE-1, and 12.2% in F1-score) over three keyphrase extraction\nbaselines for the Russian language. Although the results for cross-domain usage\nwere significantly lower, they still demonstrated the capability to surpass\nbaseline performances in several cases, underscoring the promising potential\nfor further exploration and refinement in this research field.\n","authors":["Anna Glazkova","Dmitry Morozov"],"pdf_url":"https://arxiv.org/pdf/2409.10640v2.pdf","comment":"DAMDID-2024"},{"id":"http://arxiv.org/abs/2407.10688v2","updated":"2024-09-18T07:27:51Z","published":"2024-07-15T13:01:47Z","title":"Probability Passing for Graph Neural Networks: Graph Structure and\n Representations Joint Learning","summary":" Graph Neural Networks (GNNs) have achieved notable success in the analysis of\nnon-Euclidean data across a wide range of domains. However, their applicability\nis constrained by the dependence on the observed graph structure. To solve this\nproblem, Latent Graph Inference (LGI) was proposed to infer a task-specific\nlatent structure by computing similarities or edge probabilities from node\nfeatures and then applying a GNN to produce predictions. Even so, existing\napproaches neglect the noise from node features, which affects the generated\ngraph structure and performance. 
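One metric the covariance abstract above lists as covered, the log-Euclidean distance, can be computed in plug-in form as the Frobenius norm of the difference of matrix logarithms; the paper's point is that such direct plug-in estimates from sample covariances are outperformed by its consistent estimator. A reference computation:

import numpy as np
from scipy.linalg import logm

def log_euclidean(A, B):
    # logm can return a complex array with negligible imaginary part for SPD inputs
    return np.linalg.norm(np.real(logm(A)) - np.real(logm(B)), ord="fro")

A = np.array([[2.0, 0.3], [0.3, 1.0]])
B = np.array([[1.5, 0.1], [0.1, 0.8]])
print(log_euclidean(A, B))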
In this work, we introduce a novel method called Probability\nPassing to refine the generated graph structure by aggregating edge\nprobabilities of neighboring nodes based on the observed graph. Furthermore, we\ncontinue to utilize the LGI framework, inputting the refined graph structure\nand node features into GNNs to obtain predictions. We name the proposed scheme\nthe Probability Passing-based Graph Neural Network (PPGNN). Moreover, an\nanchor-based technique is employed to reduce complexity and improve efficiency.\nExperimental results demonstrate the effectiveness of the proposed method.\n","authors":["Ziyan Wang","Yaxuan He","Bin Liu"],"pdf_url":"https://arxiv.org/pdf/2407.10688v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11754v1","updated":"2024-09-18T07:18:22Z","published":"2024-09-18T07:18:22Z","title":"NPAT Null-Space Projected Adversarial Training Towards Zero\n Deterioration","summary":" To mitigate the susceptibility of neural networks to adversarial attacks,\nadversarial training has emerged as a prevalent and effective defense strategy.\nIntrinsically, this countermeasure incurs a trade-off, as it sacrifices the\nmodel's accuracy in processing normal samples. To reconcile the trade-off, we\npioneer the incorporation of null-space projection into adversarial training\nand propose two innovative Null-space Projection based Adversarial\nTraining (NPAT) algorithms tackling sample generation and gradient optimization,\nnamed Null-space Projected Data Augmentation (NPDA) and Null-space Projected\nGradient Descent (NPGD), to search for an overarching optimal solution, which\nenhances robustness with almost zero deterioration in generalization\nperformance. Adversarial samples and perturbations are constrained within the\nnull-space of the decision boundary utilizing a closed-form null-space\nprojector, effectively mitigating the threat of attacks stemming from unreliable\nfeatures. Subsequently, we conduct experiments on the CIFAR10 and SVHN\ndatasets and reveal that our methodology can seamlessly combine with\nadversarial training methods and obtain comparable robustness while keeping\ngeneralization close to a high-accuracy model.\n","authors":["Hanyi Hu","Qiao Han","Kui Chen","Yao Yang"],"pdf_url":"https://arxiv.org/pdf/2409.11754v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11741v1","updated":"2024-09-18T06:54:36Z","published":"2024-09-18T06:54:36Z","title":"HARP: Human-Assisted Regrouping with Permutation Invariant Critic for\n Multi-Agent Reinforcement Learning","summary":" Human-in-the-loop reinforcement learning integrates human expertise to\naccelerate agent learning and provide critical guidance and feedback in complex\nfields. However, many existing approaches focus on single-agent tasks and\nrequire continuous human involvement during the training process, significantly\nincreasing the human workload and limiting scalability. In this paper, we\npropose HARP (Human-Assisted Regrouping with Permutation Invariant Critic), a\nmulti-agent reinforcement learning framework designed for group-oriented tasks.\nHARP integrates automatic agent regrouping with strategic human assistance\nduring deployment, allowing non-experts to offer effective\nguidance with minimal intervention. During training, agents dynamically adjust\ntheir groupings to optimize collaborative task completion. 
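The Probability Passing step in the abstract above aggregates edge probabilities of neighboring nodes over the observed graph; the abstract does not pin down the aggregation rule, so the row-normalized averaging below is an assumed stand-in.

import numpy as np

P = np.array([[0.0, 0.9, 0.1],     # edge probabilities from node-feature similarity
              [0.9, 0.0, 0.6],
              [0.1, 0.6, 0.0]])
A = np.array([[0, 1, 0],           # observed adjacency used for smoothing
              [1, 0, 1],
              [0, 1, 0]])
deg = A.sum(axis=1, keepdims=True).clip(min=1)
P_refined = (A / deg) @ P          # average each node's probabilities over observed neighbors
print(P_refined)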
When deployed, they\nactively seek human assistance and utilize the Permutation Invariant Group\nCritic to evaluate and refine human-proposed groupings, allowing non-expert\nusers to contribute valuable suggestions. In multiple collaboration scenarios,\nour approach is able to leverage limited guidance from non-experts and enhance\nperformance. The project can be found at https://github.com/huawen-hu/HARP.\n","authors":["Huawen Hu","Enze Shi","Chenxi Yue","Shuocun Yang","Zihao Wu","Yiwei Li","Tianyang Zhong","Tuo Zhang","Tianming Liu","Shu Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.11741v1.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2408.13787v2","updated":"2024-09-18T06:44:48Z","published":"2024-08-25T09:30:34Z","title":"Mask-Encoded Sparsification: Mitigating Biased Gradients in\n Communication-Efficient Split Learning","summary":" This paper introduces a novel framework designed to achieve a high\ncompression ratio in Split Learning (SL) scenarios where resource-constrained\ndevices are involved in large-scale model training. Our investigations\ndemonstrate that compressing feature maps within SL leads to biased gradients\nthat can negatively impact the convergence rates and diminish the\ngeneralization capabilities of the resulting models. Our theoretical analysis\nprovides insights into how compression errors critically hinder SL performance,\nwhich previous methodologies underestimate. To address these challenges, we\nemploy a narrow bit-width encoded mask to compensate for the sparsification\nerror without increasing the order of time complexity. Supported by rigorous\ntheoretical analysis, our framework significantly reduces compression errors\nand accelerates the convergence. Extensive experiments also verify that our\nmethod outperforms existing solutions regarding training efficiency and\ncommunication complexity.\n","authors":["Wenxuan Zhou","Zhihao Qu","Shen-Huan Lyu","Miao Cai","Baoliu Ye"],"pdf_url":"https://arxiv.org/pdf/2408.13787v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.17207v5","updated":"2024-09-18T05:53:17Z","published":"2023-09-29T12:59:28Z","title":"Memory Gym: Towards Endless Tasks to Benchmark Memory Capabilities of\n Agents","summary":" Memory Gym presents a suite of 2D partially observable environments, namely\nMortar Mayhem, Mystery Path, and Searing Spotlights, designed to benchmark\nmemory capabilities in decision-making agents. These environments, originally\nwith finite tasks, are expanded into innovative, endless formats, mirroring the\nescalating challenges of cumulative memory games such as ``I packed my bag''.\nThis progression in task design shifts the focus from merely assessing sample\nefficiency to also probing the levels of memory effectiveness in dynamic,\nprolonged scenarios. To address the gap in available memory-based Deep\nReinforcement Learning baselines, we introduce an implementation that\nintegrates Transformer-XL (TrXL) with Proximal Policy Optimization. This\napproach utilizes TrXL as a form of episodic memory, employing a sliding window\ntechnique. Our comparative study between the Gated Recurrent Unit (GRU) and\nTrXL reveals varied performances across different settings. TrXL, on the finite\nenvironments, demonstrates superior sample efficiency in Mystery Path and\noutperforms in Mortar Mayhem. However, GRU is more efficient on Searing\nSpotlights. Most notably, in all endless tasks, GRU makes a remarkable\nresurgence, consistently outperforming TrXL by significant margins. 
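The wire format implied by the mask-encoded sparsification abstract above, top-k values plus a narrow bitmask, can be sketched directly; the error-compensation logic that is the paper's actual contribution is omitted here.

import numpy as np

def sparsify_with_mask(x, k):
    # keep the k largest-magnitude entries; ship their values plus a packed 1-bit mask
    idx = np.argsort(np.abs(x))[-k:]
    mask = np.zeros(x.size, dtype=np.uint8)
    mask[idx] = 1
    return x[idx], np.packbits(mask)

x = np.random.randn(16).astype(np.float32)
vals, packed = sparsify_with_mask(x, k=4)
print(vals.nbytes + packed.nbytes, "bytes instead of", x.nbytes)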
Website and\nSource Code: https://github.com/MarcoMeter/endless-memory-gym/\n","authors":["Marco Pleines","Matthias Pallasch","Frank Zimmer","Mike Preuss"],"pdf_url":"https://arxiv.org/pdf/2309.17207v5.pdf","comment":"40 pages, 12 figures, 7 tables, under review"},{"id":"http://arxiv.org/abs/2409.11713v1","updated":"2024-09-18T05:43:22Z","published":"2024-09-18T05:43:22Z","title":"From exponential to finite/fixed-time stability: Applications to\n optimization","summary":" The development of finite/fixed-time stable optimization algorithms typically\ninvolves study of specific problem instances. The lack of a unified framework\nhinders understanding of more sophisticated algorithms, e.g., primal-dual\ngradient flow dynamics. The purpose of this paper is to address the following\nquestion: Given an exponentially stable optimization algorithm, can it be\nmodified to obtain a finite/fixed-time stable algorithm? We provide an\naffirmative answer, demonstrate how the solution can be computed on a\nfinite-time interval via a simple scaling of the right-hand-side of the\noriginal dynamics, and certify the desired properties of the modified algorithm\nusing the Lyapunov function that proves exponential stability of the original\nsystem. Finally, we examine nonsmooth composite optimization problems and\nsmooth problems with linear constraints to demonstrate the merits of our\napproach.\n","authors":["Ibrahim K. Ozaslan","Mihailo R. Jovanović"],"pdf_url":"https://arxiv.org/pdf/2409.11713v1.pdf","comment":"6 pages; 1 figure"},{"id":"http://arxiv.org/abs/2409.11704v1","updated":"2024-09-18T05:13:18Z","published":"2024-09-18T05:13:18Z","title":"From Lists to Emojis: How Format Bias Affects Model Alignment","summary":" In this paper, we study format biases in reinforcement learning from human\nfeedback (RLHF). We observe that many widely-used preference models, including\nhuman evaluators, GPT-4, and top-ranking models on the RewardBench benchmark,\nexhibit strong biases towards specific format patterns, such as lists, links,\nbold text, and emojis. Furthermore, large language models (LLMs) can exploit\nthese biases to achieve higher rankings on popular benchmarks like AlpacaEval\nand LMSYS Chatbot Arena. One notable example of this is verbosity bias, where\ncurrent preference models favor longer responses that appear more\ncomprehensive, even when their quality is equal to or lower than shorter,\ncompeting responses. However, format biases beyond verbosity remain largely\nunderexplored in the literature. In this work, we extend the study of biases in\npreference learning beyond the commonly recognized length bias, offering a\ncomprehensive analysis of a wider range of format biases. Additionally, we show\nthat with a small amount of biased data (less than 1%), we can inject\nsignificant bias into the reward model. Moreover, these format biases can also\nbe easily exploited by downstream alignment algorithms, such as best-of-n\nsampling and online iterative DPO, as it is usually easier to manipulate the\nformat than to improve the quality of responses. 
Our findings emphasize the\nneed to disentangle format and content both for designing alignment algorithms\nand evaluating models.\n","authors":["Xuanchang Zhang","Wei Xiong","Lichang Chen","Tianyi Zhou","Heng Huang","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.11704v1.pdf","comment":"Working in progress"},{"id":"http://arxiv.org/abs/2403.18330v3","updated":"2024-09-18T04:54:28Z","published":"2024-03-27T08:11:25Z","title":"Tracking-Assisted Object Detection with Event Cameras","summary":" Event-based object detection has recently garnered attention in the computer\nvision community due to the exceptional properties of event cameras, such as\nhigh dynamic range and no motion blur. However, feature asynchronism and\nsparsity cause invisible objects due to no relative motion to the camera,\nposing a significant challenge in the task. Prior works have studied various\nimplicit-learned memories to retain as many temporal cues as possible. However,\nimplicit memories still struggle to preserve long-term features effectively. In\nthis paper, we consider those invisible objects as pseudo-occluded objects and\naim to detect them by tracking through occlusions. Firstly, we introduce the\nvisibility attribute of objects and contribute an auto-labeling algorithm to\nnot only clean the existing event camera dataset but also append additional\nvisibility labels to it. Secondly, we exploit tracking strategies for\npseudo-occluded objects to maintain their permanence and retain their bounding\nboxes, even when features have not been available for a very long time. These\nstrategies can be treated as an explicit-learned memory guided by the tracking\nobjective to record the displacements of objects across frames. Lastly, we\npropose a spatio-temporal feature aggregation module to enrich the latent\nfeatures and a consistency loss to increase the robustness of the overall\npipeline. We conduct comprehensive experiments to verify our method's\neffectiveness where still objects are retained, but real occluded objects are\ndiscarded. The results demonstrate that (1) the additional visibility labels\ncan assist in supervised training, and (2) our method outperforms\nstate-of-the-art approaches with a significant improvement of 7.9% absolute\nmAP.\n","authors":["Ting-Kang Yen","Igor Morawski","Shusil Dangi","Kai He","Chung-Yi Lin","Jia-Fong Yeh","Hung-Ting Su","Winston Hsu"],"pdf_url":"https://arxiv.org/pdf/2403.18330v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11697v1","updated":"2024-09-18T04:36:05Z","published":"2024-09-18T04:36:05Z","title":"Monomial Matrix Group Equivariant Neural Functional Networks","summary":" Neural functional networks (NFNs) have recently gained significant attention\ndue to their diverse applications, ranging from predicting network\ngeneralization and network editing to classifying implicit neural\nrepresentation. Previous NFN designs often depend on permutation symmetries in\nneural networks' weights, which traditionally arise from the unordered\narrangement of neurons in hidden layers. However, these designs do not take\ninto account the weight scaling symmetries of $\\operatorname{ReLU}$ networks,\nand the weight sign flipping symmetries of $\\operatorname{sin}$ or\n$\\operatorname{tanh}$ networks. 
In this paper, we extend the study of the group\naction on the network weights from the group of permutation matrices to the\ngroup of monomial matrices by incorporating scaling/sign-flipping symmetries.\nParticularly, we encode these scaling/sign-flipping symmetries by designing our\ncorresponding equivariant and invariant layers. We name our new family of NFNs\nthe Monomial Matrix Group Equivariant Neural Functional Networks\n(Monomial-NFN). Because of the expansion of the symmetries, Monomial-NFN has\nfar fewer independent trainable parameters than the baseline NFNs in\nthe literature, thus enhancing the model's efficiency. Moreover, for fully\nconnected and convolutional neural networks, we theoretically prove that all\ngroups that leave these networks invariant while acting on their weight spaces\nare subgroups of the monomial matrix group. We provide empirical evidence\ndemonstrating the advantages of our model over existing baselines, achieving\ncompetitive performance and efficiency.\n","authors":["Hoang V. Tran","Thieu N. Vo","Tho H. Tran","An T. Nguyen","Tan Minh Nguyen"],"pdf_url":"https://arxiv.org/pdf/2409.11697v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11686v1","updated":"2024-09-18T03:56:56Z","published":"2024-09-18T03:56:56Z","title":"Detecting Underdiagnosed Medical Conditions with Deep Learning-Based\n Opportunistic CT Imaging","summary":" Abdominal computed tomography (CT) scans are frequently performed in clinical\nsettings. Opportunistic CT involves repurposing routine CT images to extract\ndiagnostic information and is an emerging tool for detecting underdiagnosed\nconditions such as sarcopenia, hepatic steatosis, and ascites. This study\nutilizes deep learning methods to promote accurate diagnosis and clinical\ndocumentation. We analyze 2,674 inpatient CT scans to identify discrepancies\nbetween imaging phenotypes (characteristics derived from opportunistic CT\nscans) and their corresponding documentation in radiology reports and ICD\ncoding. Through our analysis, we find that only 0.5%, 3.2%, and 30.7% of scans\ndiagnosed with sarcopenia, hepatic steatosis, and ascites (respectively)\nthrough either opportunistic imaging or radiology reports were ICD-coded. Our\nfindings demonstrate opportunistic CT's potential to enhance diagnostic\nprecision and accuracy of risk adjustment models, offering advancements in\nprecision medicine.\n","authors":["Asad Aali","Andrew Johnston","Louis Blankemeier","Dave Van Veen","Laura T Derry","David Svec","Jason Hom","Robert D. Boutin","Akshay S. Chaudhari"],"pdf_url":"https://arxiv.org/pdf/2409.11686v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11684v1","updated":"2024-09-18T03:52:48Z","published":"2024-09-18T03:52:48Z","title":"Recurrent Interpolants for Probabilistic Time Series Prediction","summary":" Sequential models such as recurrent neural networks or transformer-based\nmodels have become \textit{de facto} tools for multivariate time series\nforecasting in a probabilistic fashion, with applications to a wide range of\ndatasets, such as finance, biology, and medicine. Despite their adeptness in\ncapturing dependencies, assessing prediction uncertainty, and efficiency in\ntraining, challenges emerge in modeling high-dimensional complex distributions\nand cross-feature dependencies. To tackle these issues, recent works delve into\ngenerative modeling by employing diffusion or flow-based models. 
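A monomial matrix, the object that gives Monomial-NFN above its name, is a permutation matrix whose nonzero entries are scaled (the ReLU symmetry) or sign-flipped (the sin/tanh symmetry). A tiny numpy constructor makes the group element concrete:

import numpy as np

def random_monomial(n, sign_flip=False):
    # permutation matrix times a diagonal: exactly one nonzero per row and column
    P = np.eye(n)[np.random.permutation(n)]
    d = np.random.choice([-1.0, 1.0], n) if sign_flip else np.random.rand(n) + 0.5
    return P * d  # equals P @ np.diag(d)

print(random_monomial(4))                  # positive-scaling symmetry
print(random_monomial(4, sign_flip=True))  # sign-flipping symmetry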
Notably, the\nintegration of stochastic differential equations or probability flow\nsuccessfully extends these methods to probabilistic time series imputation and\nforecasting. However, scalability issues necessitate a computation-friendly\nframework for large-scale generative model-based predictions. This work\nproposes a novel approach by blending the computational efficiency of recurrent\nneural networks with the high-quality probabilistic modeling of the diffusion\nmodel, which addresses challenges and advances generative models' application\nin time series forecasting. Our method relies on the foundation of stochastic\ninterpolants and the extension to a broader conditional generation framework\nwith additional control features, offering insights for future developments in\nthis dynamic field.\n","authors":["Yu Chen","Marin Biloš","Sarthak Mittal","Wei Deng","Kashif Rasul","Anderson Schneider"],"pdf_url":"https://arxiv.org/pdf/2409.11684v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.08159v2","updated":"2024-09-18T03:38:16Z","published":"2024-07-11T03:25:40Z","title":"Model-agnostic clean-label backdoor mitigation in cybersecurity\n environments","summary":" The training phase of machine learning models is a delicate step, especially\nin cybersecurity contexts. Recent research has surfaced a series of insidious\ntraining-time attacks that inject backdoors in models designed for security\nclassification tasks without altering the training labels. With this work, we\npropose new techniques that leverage insights in cybersecurity threat models to\neffectively mitigate these clean-label poisoning attacks, while preserving the\nmodel utility. By performing density-based clustering on a carefully chosen\nfeature subspace, and progressively isolating the suspicious clusters through a\nnovel iterative scoring procedure, our defensive mechanism can mitigate the\nattacks without requiring many of the common assumptions in the existing\nbackdoor defense literature. To show the generality of our proposed mitigation,\nwe evaluate it on two clean-label model-agnostic attacks on two different\nclassic cybersecurity data modalities: network flow classification and malware\nclassification, using gradient boosting and neural network models.\n","authors":["Giorgio Severi","Simona Boboila","John Holodnak","Kendra Kratkiewicz","Rauf Izmailov","Alina Oprea"],"pdf_url":"https://arxiv.org/pdf/2407.08159v2.pdf","comment":"14 pages, 8 figures"},{"id":"http://arxiv.org/abs/2409.11678v1","updated":"2024-09-18T03:34:31Z","published":"2024-09-18T03:34:31Z","title":"An Enhanced-State Reinforcement Learning Algorithm for Multi-Task Fusion\n in Large-Scale Recommender Systems","summary":" As the last key stage of Recommender Systems (RSs), Multi-Task Fusion (MTF)\nis in charge of combining multiple scores predicted by Multi-Task Learning\n(MTL) into a final score to maximize user satisfaction, which decides the\nultimate recommendation results. In recent years, to maximize long-term user\nsatisfaction within a recommendation session, Reinforcement Learning (RL) is\nwidely used for MTF in large-scale RSs. However, limited by their modeling\npattern, all the current RL-MTF methods can only utilize user features as the\nstate to generate actions for each user, but are unable to make use of item\nfeatures and other valuable features, which leads to suboptimal results.\nAddressing this problem is a challenge that requires breaking through the\ncurrent modeling pattern of RL-MTF. 
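The backdoor-mitigation abstract above builds on density-based clustering in a feature subspace, where poisoned points form an unusually tight cluster. A toy DBSCAN illustration of that starting point (the paper's iterative scoring procedure is not reproduced here):

import numpy as np
from sklearn.cluster import DBSCAN

rng = np.random.default_rng(0)
benign = rng.normal(0.0, 1.0, (200, 2))    # diffuse benign features
poison = rng.normal(3.0, 0.05, (20, 2))    # tight, suspicious clump
X = np.vstack([benign, poison])
labels = DBSCAN(eps=0.3, min_samples=10).fit_predict(X)
for lab in sorted(set(labels)):
    pts = X[labels == lab]
    print("cluster", lab, "size", len(pts), "spread", round(float(pts.std()), 3))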
To solve this problem, we propose a novel\nmethod called Enhanced-State RL for MTF in RSs. Unlike the existing methods\nmentioned above, our method first defines user features, item features, and\nother valuable features collectively as the enhanced state; it then proposes a\nnovel actor-critic learning process that utilizes the enhanced state to take\nmuch better actions for each user-item pair. To the best of our knowledge, this\nmodeling pattern is proposed for the first time in the field of\nRL-MTF. We conduct extensive offline and online experiments in a large-scale\nRS. The results demonstrate that our model outperforms other models\nsignificantly. Enhanced-State RL has been fully deployed in our RS for more\nthan half a year, improving user valid consumption by +3.84% and user duration\ntime by +0.58% compared to the baseline.\n","authors":["Peng Liu","Jiawei Zhu","Cong Xu","Ming Zhao","Bin Wang"],"pdf_url":"https://arxiv.org/pdf/2409.11678v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2404.17589"},{"id":"http://arxiv.org/abs/2409.11676v1","updated":"2024-09-18T03:30:38Z","published":"2024-09-18T03:30:38Z","title":"Hypergraph-based Motion Generation with Multi-modal Interaction\n Relational Reasoning","summary":" The intricate nature of real-world driving environments, characterized by\ndynamic and diverse interactions among multiple vehicles and their possible\nfuture states, presents considerable challenges in accurately predicting the\nmotion states of vehicles and handling the uncertainty inherent in the\npredictions. Addressing these challenges requires comprehensive modeling and\nreasoning to capture the implicit relations among vehicles and the\ncorresponding diverse behaviors. This research introduces an integrated\nframework for autonomous vehicle (AV) motion prediction to address these\ncomplexities, utilizing a novel Relational Hypergraph Interaction-informed\nNeural mOtion generator (RHINO). RHINO leverages hypergraph-based relational\nreasoning by integrating a multi-scale hypergraph neural network to model\ngroup-wise interactions among multiple vehicles and their multi-modal driving\nbehaviors, thereby enhancing motion prediction accuracy and reliability.\nExperimental validation using real-world datasets demonstrates the superior\nperformance of this framework in improving predictive accuracy and fostering\nsocially aware automated driving in dynamic traffic scenarios.\n","authors":["Keshu Wu","Yang Zhou","Haotian Shi","Dominique Lord","Bin Ran","Xinyue Ye"],"pdf_url":"https://arxiv.org/pdf/2409.11676v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.04315v2","updated":"2024-09-18T02:56:59Z","published":"2022-01-12T05:45:27Z","title":"On the Statistical Complexity of Sample Amplification","summary":" The ``sample amplification'' problem formalizes the following question: Given\n$n$ i.i.d. samples drawn from an unknown distribution $P$, when is it possible\nto produce a larger set of $n+m$ samples which cannot be distinguished from\n$n+m$ i.i.d. samples drawn from $P$? In this work, we provide a firm\nstatistical foundation for this problem by deriving generally applicable\namplification procedures, lower bound techniques and connections to existing\nstatistical notions. 
Our techniques apply to a large class of distributions,\nincluding the exponential family, and establish a rigorous connection between\nsample amplification and distribution learning.\n","authors":["Brian Axelrod","Shivam Garg","Yanjun Han","Vatsal Sharan","Gregory Valiant"],"pdf_url":"https://arxiv.org/pdf/2201.04315v2.pdf","comment":"To appear in the Annals of Statistics"},{"id":"http://arxiv.org/abs/2409.11657v1","updated":"2024-09-18T02:48:36Z","published":"2024-09-18T02:48:36Z","title":"Few-Shot Class-Incremental Learning with Non-IID Decentralized Data","summary":" Few-shot class-incremental learning is crucial for developing scalable and\nadaptive intelligent systems, as it enables models to acquire new classes with\nminimal annotated data while safeguarding the previously accumulated knowledge.\nNonetheless, existing methods deal with continuous data streams in a\ncentralized manner, limiting their applicability in scenarios that prioritize\ndata privacy and security. To this end, this paper introduces federated\nfew-shot class-incremental learning, a decentralized machine learning paradigm\ntailored to progressively learn new classes from scarce data distributed across\nmultiple clients. In this learning paradigm, clients locally update their\nmodels with new classes while preserving data privacy, and then transmit the\nmodel updates to a central server where they are aggregated globally. However,\nthis paradigm faces several issues, such as difficulties in few-shot learning,\ncatastrophic forgetting, and data heterogeneity. To address these challenges,\nwe present a synthetic data-driven framework that leverages replay buffer data\nto maintain existing knowledge and facilitate the acquisition of new knowledge.\nWithin this framework, a noise-aware generative replay module is developed to\nfine-tune local models with a balance of new and replay data, while generating\nsynthetic data of new classes to further expand the replay buffer for future\ntasks. Furthermore, a class-specific weighted aggregation strategy is designed\nto tackle data heterogeneity by adaptively aggregating class-specific\nparameters based on local models' performance on synthetic data. This enables\neffective global model optimization without direct access to client data.\nComprehensive experiments across three widely-used datasets underscore the\neffectiveness and preeminence of the introduced framework.\n","authors":["Cuiwei Liu","Siang Xu","Huaijun Qiu","Jing Zhang","Zhi Liu","Liang Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.11657v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11654v1","updated":"2024-09-18T02:41:50Z","published":"2024-09-18T02:41:50Z","title":"How to Build the Virtual Cell with Artificial Intelligence: Priorities\n and Opportunities","summary":" The cell is arguably the smallest unit of life and is central to\nunderstanding biology. Accurate modeling of cells is important for this\nunderstanding as well as for determining the root causes of disease. Recent\nadvances in artificial intelligence (AI), combined with the ability to generate\nlarge-scale experimental data, present novel opportunities to model cells. Here\nwe propose a vision of AI-powered Virtual Cells, where robust representations\nof cells and cellular systems under different conditions are directly learned\nfrom growing biological data across measurements and scales. 
We discuss desired\ncapabilities of AI Virtual Cells, including generating universal\nrepresentations of biological entities across scales, and facilitating\ninterpretable in silico experiments to predict and understand their behavior\nusing Virtual Instruments. We further address the challenges, opportunities and\nrequirements to realize this vision including data needs, evaluation\nstrategies, and community standards and engagement to ensure biological\naccuracy and broad utility. We envision a future where AI Virtual Cells help\nidentify new drug targets, predict cellular responses to perturbations, as well\nas scale hypothesis exploration. With open science collaborations across the\nbiomedical ecosystem that includes academia, philanthropy, and the biopharma\nand AI industries, a comprehensive predictive understanding of cell mechanisms\nand interactions is within reach.\n","authors":["Charlotte Bunne","Yusuf Roohani","Yanay Rosen","Ankit Gupta","Xikun Zhang","Marcel Roed","Theo Alexandrov","Mohammed AlQuraishi","Patricia Brennan","Daniel B. Burkhardt","Andrea Califano","Jonah Cool","Abby F. Dernburg","Kirsty Ewing","Emily B. Fox","Matthias Haury","Amy E. Herr","Eric Horvitz","Patrick D. Hsu","Viren Jain","Gregory R. Johnson","Thomas Kalil","David R. Kelley","Shana O. Kelley","Anna Kreshuk","Tim Mitchison","Stephani Otte","Jay Shendure","Nicholas J. Sofroniew","Fabian Theis","Christina V. Theodoris","Srigokul Upadhyayula","Marc Valer","Bo Wang","Eric Xing","Serena Yeung-Levy","Marinka Zitnik","Theofanis Karaletsos","Aviv Regev","Emma Lundberg","Jure Leskovec","Stephen R. Quake"],"pdf_url":"https://arxiv.org/pdf/2409.11654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11653v1","updated":"2024-09-18T02:40:31Z","published":"2024-09-18T02:40:31Z","title":"Enhancing Semi-Supervised Learning via Representative and Diverse Sample\n Selection","summary":" Semi-Supervised Learning (SSL) has become a preferred paradigm in many deep\nlearning tasks, which reduces the need for human labor. Previous studies\nprimarily focus on effectively utilising the labelled and unlabeled data to\nimprove performance. However, we observe that how to select samples for\nlabelling also significantly impacts performance, particularly under extremely\nlow-budget settings. The sample selection task in SSL has been under-explored\nfor a long time. To fill in this gap, we propose a Representative and Diverse\nSample Selection approach (RDSS). By adopting a modified Frank-Wolfe algorithm\nto minimise a novel criterion $\\alpha$-Maximum Mean Discrepancy ($\\alpha$-MMD),\nRDSS samples a representative and diverse subset for annotation from the\nunlabeled data. We demonstrate that minimizing $\\alpha$-MMD enhances the\ngeneralization ability of low-budget learning. 
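As a concrete aside: the abstract does not spell out the $\alpha$-MMD criterion or the modified Frank-Wolfe step, but the flavor of picking a subset that stays representative of the full unlabeled pool can be illustrated with plain RBF-kernel MMD and a greedy stand-in for the optimizer. Everything below is an illustrative sketch, not the paper's code; all function names are assumptions.

```python
# Sketch only: plain (biased) RBF-kernel MMD^2 between a selected subset and
# the full unlabeled pool, the kind of representativeness criterion a
# Frank-Wolfe-style selector would minimize.
import numpy as np

def rbf_kernel(X, Y, gamma=1.0):
    # k(x, y) = exp(-gamma * ||x - y||^2) for all pairs
    sq = ((X[:, None, :] - Y[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * sq)

def mmd2(X, Y, gamma=1.0):
    # Biased estimate of squared Maximum Mean Discrepancy between samples X, Y
    return (rbf_kernel(X, X, gamma).mean()
            - 2.0 * rbf_kernel(X, Y, gamma).mean()
            + rbf_kernel(Y, Y, gamma).mean())

def greedy_select(pool, budget, gamma=1.0):
    # Greedy stand-in for the paper's modified Frank-Wolfe step: repeatedly
    # add the point whose inclusion yields the lowest MMD^2 to the whole pool.
    chosen = []
    for _ in range(budget):
        best_i, best_val = None, np.inf
        for i in range(len(pool)):
            if i in chosen:
                continue
            val = mmd2(pool[chosen + [i]], pool, gamma)
            if val < best_val:
                best_i, best_val = i, val
        chosen.append(best_i)
    return chosen

rng = np.random.default_rng(0)
pool = rng.normal(size=(60, 2))
print(greedy_select(pool, budget=5))  # indices of a representative subset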
Experimental results show that\nRDSS consistently improves the performance of several popular SSL frameworks\nand outperforms the state-of-the-art sample selection approaches used in Active\nLearning (AL) and Semi-Supervised Active Learning (SSAL), even with constrained\nannotation budgets.\n","authors":["Qian Shao","Jiangrui Kang","Qiyuan Chen","Zepeng Li","Hongxia Xu","Yiwen Cao","Jiajuan Liang","Jian Wu"],"pdf_url":"https://arxiv.org/pdf/2409.11653v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2409.11650v1","updated":"2024-09-18T02:35:00Z","published":"2024-09-18T02:35:00Z","title":"Art and Science of Quantizing Large-Scale Models: A Comprehensive\n Overview","summary":" This paper provides a comprehensive overview of the principles, challenges,\nand methodologies associated with quantizing large-scale neural network models.\nAs neural networks have evolved towards larger and more complex architectures\nto address increasingly sophisticated tasks, the computational and energy costs\nhave escalated significantly. We explore the necessity and impact of model size\ngrowth, highlighting the performance benefits as well as the computational\nchallenges and environmental considerations. The core focus is on model\nquantization as a fundamental approach to mitigate these challenges by reducing\nmodel size and improving efficiency without substantially compromising\naccuracy. We delve into various quantization techniques, including both\npost-training quantization (PTQ) and quantization-aware training (QAT), and\nanalyze several state-of-the-art algorithms such as LLM-QAT, PEQA(L4Q),\nZeroQuant, SmoothQuant, and others. Through comparative analysis, we examine\nhow these methods address issues like outliers, importance weighting, and\nactivation quantization, ultimately contributing to more sustainable and\naccessible deployment of large-scale models.\n","authors":["Yanshu Wang","Tong Yang","Xiyan Liang","Guoan Wang","Hanning Lu","Xu Zhe","Yaoming Li","Li Weitao"],"pdf_url":"https://arxiv.org/pdf/2409.11650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11646v1","updated":"2024-09-18T02:17:10Z","published":"2024-09-18T02:17:10Z","title":"Hard-Label Cryptanalytic Extraction of Neural Network Models","summary":" The machine learning problem of extracting neural network parameters was\nproposed nearly three decades ago. Functionally equivalent extraction is a\ncrucial goal for research on this problem. When the adversary has access to the\nraw output of neural networks, various attacks, including those presented at\nCRYPTO 2020 and EUROCRYPT 2024, have successfully achieved this goal. However,\nthis goal is not achieved when neural networks operate under a hard-label\nsetting where the raw output is inaccessible.\n In this paper, we propose the first attack that theoretically achieves\nfunctionally equivalent extraction under the hard-label setting, which applies\nto ReLU neural networks. The effectiveness of our attack is validated through\npractical experiments on a wide range of ReLU neural networks, including neural\nnetworks trained on two real benchmarking datasets (MNIST, CIFAR10) widely used\nin computer vision. 
For a neural network consisting of $10^5$ parameters, our\nattack only requires several hours on a single core.\n","authors":["Yi Chen","Xiaoyang Dong","Jian Guo","Yantian Shen","Anyu Wang","Xiaoyun Wang"],"pdf_url":"https://arxiv.org/pdf/2409.11646v1.pdf","comment":"Accepted by Asiacrypt 2024"},{"id":"http://arxiv.org/abs/2409.11642v1","updated":"2024-09-18T02:14:08Z","published":"2024-09-18T02:14:08Z","title":"DAF-Net: A Dual-Branch Feature Decomposition Fusion Network with Domain\n Adaptive for Infrared and Visible Image Fusion","summary":" Infrared and visible image fusion aims to combine complementary information\nfrom both modalities to provide a more comprehensive scene understanding.\nHowever, due to the significant differences between the two modalities,\npreserving key features during the fusion process remains a challenge. To\naddress this issue, we propose a dual-branch feature decomposition fusion\nnetwork (DAF-Net) with domain adaptation, which introduces Multi-Kernel Maximum\nMean Discrepancy (MK-MMD) into the base encoder and designs a hybrid kernel\nfunction suitable for infrared and visible image fusion. The base encoder built\non the Restormer network captures global structural information while the\ndetail encoder based on Invertible Neural Networks (INN) focuses on extracting\ndetail texture information. By incorporating MK-MMD, the DAF-Net effectively\naligns the latent feature spaces of visible and infrared images, thereby\nimproving the quality of the fused images. Experimental results demonstrate\nthat the proposed method outperforms existing techniques across multiple\ndatasets, significantly enhancing both visual quality and fusion performance.\nThe related Python code is available at https://github.com/xujian000/DAF-Net.\n","authors":["Jian Xu","Xin He"],"pdf_url":"https://arxiv.org/pdf/2409.11642v1.pdf","comment":"5 pages, 4 figures"},{"id":"http://arxiv.org/abs/2409.11640v1","updated":"2024-09-18T02:08:17Z","published":"2024-09-18T02:08:17Z","title":"Enhancing PM2.5 Data Imputation and Prediction in Air Quality Monitoring\n Networks Using a KNN-SINDy Hybrid Model","summary":" Air pollution, particularly particulate matter (PM2.5), poses significant\nrisks to public health and the environment, necessitating accurate prediction\nand continuous monitoring for effective air quality management. However, air\nquality monitoring (AQM) data often suffer from missing records due to various\ntechnical difficulties. This study explores the application of Sparse\nIdentification of Nonlinear Dynamics (SINDy) for imputing missing PM2.5 data,\nusing training data from 2016 for prediction, and compares its performance with\nthe established Soft Impute (SI) and K-Nearest Neighbors (KNN) methods.\n","authors":["Yohan Choi","Boaz Choi","Jachin Choi"],"pdf_url":"https://arxiv.org/pdf/2409.11640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.13803v3","updated":"2024-09-18T01:58:09Z","published":"2022-09-28T03:14:10Z","title":"FedVeca: Federated Vectorized Averaging on Non-IID Data with Adaptive\n Bi-directional Global Objective","summary":" Federated Learning (FL) is a distributed machine learning framework to\nalleviate the data silos, where decentralized clients collaboratively learn a\nglobal model without sharing their private data. 
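For context on the aggregation step that such federated methods modify, here is a minimal sketch of standard sample-weighted federated averaging (FedAvg-style), which FedVeca's bi-directional vectorized averaging refines. The names are illustrative assumptions, not the paper's implementation.

```python
# Minimal FedAvg-style aggregation sketch. FedVeca's actual update treats each
# client's contribution as a bi-directional vector (step size = number of
# local updates, plus a direction); only the plain weighted average that such
# methods build on is shown here.
import numpy as np

def fedavg(client_params, client_num_samples):
    # client_params: list of 1-D parameter vectors, one per client
    # client_num_samples: how many local samples each client trained on
    weights = np.array(client_num_samples, dtype=float)
    weights /= weights.sum()
    stacked = np.stack(client_params)           # (num_clients, dim)
    return (weights[:, None] * stacked).sum(0)  # sample-weighted average

clients = [np.array([1.0, 2.0]), np.array([3.0, 0.0]), np.array([2.0, 2.0])]
print(fedavg(clients, client_num_samples=[100, 50, 50]))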
However, the clients'\nNon-Independent and Identically Distributed (Non-IID) data negatively affect\nthe trained model, and clients with different numbers of local updates may\ncause significant gaps between the local gradients in each communication round. In\nthis paper, we propose a Federated Vectorized Averaging (FedVeca) method to\naddress the above problem on Non-IID data. Specifically, we set a novel\nobjective for the global model which is related to the local gradients. The\nlocal gradient is defined as a bi-directional vector with step size and\ndirection, where the step size is the number of local updates and the direction\nis divided into positive and negative according to our definition. In FedVeca,\nthe direction is influenced by the step size, thus we average the\nbi-directional vectors to reduce the effect of different step sizes. Then, we\ntheoretically analyze the relationship between the step sizes and the global\nobjective, and obtain upper bounds on the step sizes per communication round.\nBased on the upper bounds, we design an algorithm for the server and the client\nto adaptively adjust the step sizes to make the objective close to the\noptimum. Finally, we conduct experiments on different datasets, models and\nscenarios by building a prototype system, and the experimental results\ndemonstrate the effectiveness and efficiency of the FedVeca method.\n","authors":["Ping Luo","Jieren Cheng","Zhenhao Liu","N. Xiong","Jie Wu"],"pdf_url":"https://arxiv.org/pdf/2209.13803v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09945v3","updated":"2024-09-18T01:54:50Z","published":"2024-09-16T02:37:51Z","title":"Mobility-GCN: a human mobility-based graph convolutional network for\n tracking and analyzing the spatial dynamics of the synthetic opioid crisis in\n the USA, 2013-2020","summary":" Synthetic opioids are the most common drugs involved in drug-involved\noverdose mortalities in the U.S. The Center for Disease Control and Prevention\nreported that in 2018, about 70% of all drug overdose deaths involved opioids\nand 67% of all opioid-involved deaths were accounted for by synthetic opioids.\nIn this study, we investigated the spread of synthetic opioids between 2013 and\n2020 in the U.S. We analyzed the relationship between the spatiotemporal\npattern of synthetic opioid-involved deaths and another key opioid, heroin, and\ncompared patterns of deaths involving these two types of drugs during this\nperiod. Spatial connections and human mobility between counties were\nincorporated into a graph convolutional neural network model to represent and\nanalyze the spread of synthetic opioid-involved deaths in the context of\nprevious heroin-involved death patterns.\n","authors":["Zhiyue Xia","Kathleen Stewart"],"pdf_url":"https://arxiv.org/pdf/2409.09945v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11624v1","updated":"2024-09-18T01:08:49Z","published":"2024-09-18T01:08:49Z","title":"Multimodal Generalized Category Discovery","summary":" Generalized Category Discovery (GCD) aims to classify inputs into both known\nand novel categories, a task crucial for open-world scientific discoveries.\nHowever, current GCD methods are limited to unimodal data, overlooking the\ninherently multimodal nature of most real-world data. In this work, we extend\nGCD to a multimodal setting, where inputs from different modalities provide\nricher and complementary information. 
Through theoretical analysis and\nempirical validation, we identify that the key challenge in multimodal GCD lies\nin effectively aligning heterogeneous information across modalities. To address\nthis, we propose MM-GCD, a novel framework that aligns both the feature and\noutput spaces of different modalities using contrastive learning and\ndistillation techniques. MM-GCD achieves new state-of-the-art performance on\nthe UPMC-Food101 and N24News datasets, surpassing previous methods by 11.5\\%\nand 4.7\\%, respectively.\n","authors":["Yuchang Su","Renping Zhou","Siyu Huang","Xingjian Li","Tianyang Wang","Ziyue Wang","Min Xu"],"pdf_url":"https://arxiv.org/pdf/2409.11624v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2405.17512v2","updated":"2024-09-18T00:55:35Z","published":"2024-05-27T07:37:43Z","title":"On Fairness of Low-Rank Adaptation of Large Models","summary":" Low-rank adaptation of large models, particularly LoRA, has gained traction\ndue to its computational efficiency. This efficiency, contrasted with the\nprohibitive costs of full-model fine-tuning, means that practitioners often\nturn to LoRA, sometimes without a complete understanding of its\nramifications. In this study, we focus on fairness and ask whether LoRA has an\nunexamined impact on utility, calibration, and resistance to membership\ninference across different subgroups (e.g., genders, races, religions) compared\nto a full-model fine-tuning baseline. We present extensive experiments across\nvision and language domains and across classification and generation tasks\nusing ViT-Base, Swin-v2-Large, Llama-2 7B, and Mistral 7B. Intriguingly,\nexperiments suggest that while one can isolate cases where LoRA exacerbates\nmodel bias across subgroups, the pattern is inconsistent -- in many cases, LoRA\nhas equivalent or even improved fairness compared to the base model or its full\nfine-tuning baseline. We also examine the complications of evaluating\nfine-tuning fairness relating to task design and model token bias, calling for\nmore careful fairness evaluations in future work.\n","authors":["Zhoujie Ding","Ken Ziyu Liu","Pura Peetathawatchai","Berivan Isik","Sanmi Koyejo"],"pdf_url":"https://arxiv.org/pdf/2405.17512v2.pdf","comment":"COLM 2024 camera ready"},{"id":"http://arxiv.org/abs/2409.11618v1","updated":"2024-09-18T00:49:42Z","published":"2024-09-18T00:49:42Z","title":"PieClam: A Universal Graph Autoencoder Based on Overlapping Inclusive\n and Exclusive Communities","summary":" We propose PieClam (Prior Inclusive Exclusive Cluster Affiliation Model): a\nprobabilistic graph model for representing any graph as overlapping generalized\ncommunities. Our method can be interpreted as a graph autoencoder: nodes are\nembedded into a code space by an algorithm that maximizes the log-likelihood of\nthe decoded graph, given the input graph. PieClam is a community affiliation\nmodel that extends well-known methods like BigClam in two main manners. First,\nin addition to the decoder being defined via pairwise interactions between the\nnodes in the code space, we also incorporate a learned prior on the\ndistribution of nodes in the code space, turning our method into a graph\ngenerative model. Secondly, we generalize the notion of communities by allowing\nnot only sets of nodes with strong connectivity, which we call inclusive\ncommunities, but also sets of nodes with strong disconnection, which we call\nexclusive communities. 
To model both types of communities, we propose a new\ntype of decoder based on the Lorentz inner product, which we prove to be much more\nexpressive than standard decoders based on standard inner products or norm\ndistances. By introducing a new graph similarity measure, which we call the log\ncut distance, we show that PieClam is a universal autoencoder, able to\nuniformly approximately reconstruct any graph. Our method is shown to obtain\ncompetitive performance in graph anomaly detection benchmarks.\n","authors":["Daniel Zilberg","Ron Levie"],"pdf_url":"https://arxiv.org/pdf/2409.11618v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.14416v3","updated":"2024-09-18T00:39:53Z","published":"2022-10-26T01:58:09Z","title":"Residual Back Projection With Untrained Neural Networks","summary":" Background and Objective: The success of neural networks in a number of image\nprocessing tasks has motivated their application in image reconstruction\nproblems in computed tomography (CT). While progress has been made in this\narea, the lack of stability and theoretical guarantees for accuracy, together\nwith the scarcity of high-quality training data for specific imaging domains\npose challenges for many CT applications. In this paper, we present a framework\nfor iterative reconstruction (IR) in CT that leverages the hierarchical\nstructure of neural networks, without the need for training. Our framework\nincorporates this structural information as a deep image prior (DIP), and uses\na novel residual back projection (RBP) connection that forms the basis for our\niterations.\n Methods: We propose using an untrained U-net in conjunction with a novel\nresidual back projection to minimize an objective function and achieve\nhigh-accuracy reconstruction. In each iteration, the weights of the untrained\nU-net are optimized, and the output of the U-net in the current iteration is\nused to update the input of the U-net in the next iteration through the\naforementioned RBP connection.\n Results: Experimental results demonstrate that the RBP-DIP framework offers\nimprovements over other state-of-the-art conventional IR methods, as well as\npre-trained and untrained models with similar network structures under multiple\nconditions. These improvements are particularly significant in the few-view,\nlimited-angle, and low-dose imaging configurations.\n Conclusions: Applied to both parallel and fan beam X-ray imaging, our\nframework shows significant improvement under multiple conditions. Furthermore,\nthe proposed framework requires no training data and can be adjusted on-demand\nto adapt to different conditions (e.g. noise level, geometry, and imaged\nobject).\n","authors":["Ziyu Shu","Alireza Entezari"],"pdf_url":"https://arxiv.org/pdf/2210.14416v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.00856v3","updated":"2024-09-18T00:39:43Z","published":"2024-08-01T18:10:05Z","title":"Enhancing Changepoint Detection: Penalty Learning through Deep Learning\n Techniques","summary":" Changepoint detection, a technique for identifying significant shifts within\ndata sequences, is crucial in various fields such as finance, genomics,\nmedicine, etc. Dynamic programming changepoint detection algorithms are\nemployed to identify the locations of changepoints within a sequence, which\nrely on a penalty parameter to regulate the number of changepoints. To estimate\nthis penalty parameter, previous work uses simple models such as linear or\ntree-based models. 
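To make the role of the penalty parameter concrete, here is a minimal optimal-partitioning dynamic program with a squared-error segment cost, in which raising the penalty reduces the number of detected changepoints. This is a generic textbook formulation sketched under that assumption, not the paper's labeled-data method; all names are illustrative.

```python
import numpy as np

def seg_cost(prefix, prefix_sq, i, j):
    # Squared-error cost of fitting segment x[i:j] with its mean
    n = j - i
    s = prefix[j] - prefix[i]
    return (prefix_sq[j] - prefix_sq[i]) - s * s / n

def optimal_partitioning(x, penalty):
    # Classic O(n^2) dynamic program: F[j] = min_i F[i] + cost(i, j) + penalty.
    # A larger penalty yields fewer changepoints; this is the parameter the
    # paper learns to predict from data.
    n = len(x)
    prefix = np.concatenate([[0.0], np.cumsum(x)])
    prefix_sq = np.concatenate([[0.0], np.cumsum(x ** 2)])
    F = np.full(n + 1, np.inf)
    F[0] = -penalty
    last = np.zeros(n + 1, dtype=int)
    for j in range(1, n + 1):
        for i in range(j):
            val = F[i] + seg_cost(prefix, prefix_sq, i, j) + penalty
            if val < F[j]:
                F[j], last[j] = val, i
    cps, j = [], n          # backtrack the changepoint locations
    while j > 0:
        j = last[j]
        if j > 0:
            cps.append(j)
    return sorted(cps)

x = np.concatenate([np.zeros(50), 5 + np.zeros(50), np.ones(50)]) \
    + np.random.default_rng(1).normal(scale=0.5, size=150)
for pen in (1.0, 10.0, 100.0):
    print(pen, optimal_partitioning(x, pen))  # fewer changepoints as pen grows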
This study introduces a novel deep learning method for\npredicting penalty parameters, leading to demonstrably improved changepoint\ndetection accuracy on large benchmark supervised labeled datasets compared to\nprevious methods.\n","authors":["Tung L Nguyen","Toby Dylan Hocking"],"pdf_url":"https://arxiv.org/pdf/2408.00856v3.pdf","comment":"17 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.10777v6","updated":"2024-09-18T00:16:27Z","published":"2023-11-16T06:01:47Z","title":"A Systematic Review of Aspect-based Sentiment Analysis: Domains,\n Methods, and Trends","summary":" Aspect-based sentiment analysis (ABSA) is a fine-grained type of sentiment\nanalysis that identifies aspects and their associated opinions from a given\ntext. With the surge of digital opinionated text data, ABSA gained increasing\npopularity for its ability to mine more detailed and targeted insights. Many\nreview papers on ABSA subtasks and solution methodologies exist; however, few\nfocus on trends over time or systemic issues relating to research application\ndomains, datasets, and solution approaches. To fill the gap, this paper\npresents a systematic literature review (SLR) of ABSA studies with a focus on\ntrends and high-level relationships among these fundamental components. This\nreview is one of the largest SLRs on ABSA. To our knowledge, it is also the\nfirst to systematically examine the interrelations among ABSA research and data\ndistribution across domains, as well as trends in solution paradigms and\napproaches. Our sample includes 727 primary studies screened from 8550 search\nresults without time constraints via an innovative automatic filtering process.\nOur quantitative analysis not only identifies trends in nearly two decades of\nABSA research development but also unveils a systemic lack of dataset and\ndomain diversity as well as domain mismatch that may hinder the development of\nfuture ABSA research. We discuss these findings and their implications and\npropose suggestions for future research.\n","authors":["Yan Cathy Hua","Paul Denny","Katerina Taskova","Jörg Wicker"],"pdf_url":"https://arxiv.org/pdf/2311.10777v6.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.12193v1","updated":"2024-09-18T17:59:44Z","published":"2024-09-18T17:59:44Z","title":"Vista3D: Unravel the 3D Darkside of a Single Image","summary":" We embark on the age-old quest: unveiling the hidden dimensions of objects\nfrom mere glimpses of their visible parts. To address this, we present Vista3D,\na framework that realizes swift and consistent 3D generation within a mere 5\nminutes. At the heart of Vista3D lies a two-phase approach: the coarse phase\nand the fine phase. In the coarse phase, we rapidly generate initial geometry\nwith Gaussian Splatting from a single image. In the fine phase, we extract a\nSigned Distance Function (SDF) directly from learned Gaussian Splatting,\noptimizing it with a differentiable isosurface representation. Furthermore, it\nelevates the quality of generation by using a disentangled representation with\ntwo independent implicit functions to capture both visible and obscured aspects\nof objects. Additionally, it harmonizes gradients from the 2D diffusion prior with\n3D-aware diffusion priors by angular diffusion prior composition. 
Through\nextensive evaluation, we demonstrate that Vista3D effectively sustains a\nbalance between the consistency and diversity of the generated 3D objects.\nDemos and code will be available at https://github.com/florinshen/Vista3D.\n","authors":["Qiuhong Shen","Xingyi Yang","Michael Bi Mi","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2409.12193v1.pdf","comment":"ECCV'2024"},{"id":"http://arxiv.org/abs/2409.12140v1","updated":"2024-09-18T17:03:30Z","published":"2024-09-18T17:03:30Z","title":"MoRAG -- Multi-Fusion Retrieval Augmented Generation for Human Motion","summary":" We introduce MoRAG, a novel multi-part fusion based retrieval-augmented\ngeneration strategy for text-based human motion generation. The method enhances\nmotion diffusion models by leveraging additional knowledge obtained through an\nimproved motion retrieval process. By effectively prompting large language\nmodels (LLMs), we address spelling errors and rephrasing issues in motion\nretrieval. Our approach utilizes a multi-part retrieval strategy to improve the\ngeneralizability of motion retrieval across the language space. We create\ndiverse samples through the spatial composition of the retrieved motions.\nFurthermore, by utilizing low-level, part-specific motion information, we can\nconstruct motion samples for unseen text descriptions. Our experiments\ndemonstrate that our framework can serve as a plug-and-play module, improving\nthe performance of motion diffusion models. Code, pretrained models and sample\nvideos will be made available at: https://motion-rag.github.io/\n","authors":["Kalakonda Sai Shashank","Shubh Maheshwari","Ravi Kiran Sarvadevabhatla"],"pdf_url":"https://arxiv.org/pdf/2409.12140v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2402.05508v2","updated":"2024-09-18T09:36:32Z","published":"2024-02-08T09:37:12Z","title":"Performance Evaluation of Associative Watermarking Using Statistical\n Neurodynamics","summary":" We theoretically evaluated the performance of our proposed associative\nwatermarking method in which the watermark is not embedded directly into the\nimage. We previously proposed a watermarking method that extends the\nzero-watermarking model by applying associative memory models. In this model,\nthe hetero-associative memory model is introduced to the mapping process\nbetween image features and watermarks, and the auto-associative memory model is\napplied to correct watermark errors. We herein show that the associative\nwatermarking model outperforms the zero-watermarking model through computer\nsimulations using actual images. In this paper, we describe how we derive the\nmacroscopic state equation for the associative watermarking model using the\nOkada theory. The theoretical results obtained by the fourth-order theory were\nin good agreement with those obtained by computer simulations. Furthermore, the\nperformance of the associative watermarking model was evaluated using the bit\nerror rate of the watermark, both theoretically and using computer simulations.\n","authors":["Ryoto Kanegae","Masaki Kawamura"],"pdf_url":"https://arxiv.org/pdf/2402.05508v2.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.11786v1","updated":"2024-09-18T08:10:35Z","published":"2024-09-18T08:10:35Z","title":"Efficient Low-Resolution Face Recognition via Bridge Distillation","summary":" Face recognition in the wild is now advancing towards light-weight models,\nfast inference speed and resolution-adapted capability. 
In this paper, we\npropose a bridge distillation approach to turn a complex face model pretrained\non private high-resolution faces into a light-weight one for low-resolution\nface recognition. In our approach, such a cross-dataset resolution-adapted\nknowledge transfer problem is solved via two-step distillation. In the first\nstep, we conduct cross-dataset distillation to transfer the prior knowledge\nfrom private high-resolution faces to public high-resolution faces and generate\ncompact and discriminative features. In the second step, the resolution-adapted\ndistillation is conducted to further transfer the prior knowledge to synthetic\nlow-resolution faces via multi-task learning. By learning low-resolution face\nrepresentations and mimicking the adapted high-resolution knowledge, a\nlight-weight student model can be constructed with high efficiency and\npromising accuracy in recognizing low-resolution faces. Experimental results\nshow that the student model performs impressively in recognizing low-resolution\nfaces with only 0.21M parameters and 0.057MB memory. Meanwhile, its speed\nreaches up to 14,705, ~934 and 763 faces per second on GPU, CPU and mobile\nphone, respectively.\n","authors":["Shiming Ge","Shengwei Zhao","Chenyu Li","Yu Zhang","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2409.11786v1.pdf","comment":"This paper is published in IEEE TIP 2020"},{"id":"http://arxiv.org/abs/2409.11729v1","updated":"2024-09-18T06:38:48Z","published":"2024-09-18T06:38:48Z","title":"DETECLAP: Enhancing Audio-Visual Representation Learning with Object\n Information","summary":" Current audio-visual representation learning can capture rough object\ncategories (e.g., ``animals'' and ``instruments''), but it lacks the ability to\nrecognize fine-grained details, such as specific categories like ``dogs'' and\n``flutes'' within animals and instruments. To address this issue, we introduce\nDETECLAP, a method to enhance audio-visual representation learning with object\ninformation. Our key idea is to introduce an audio-visual label prediction loss\nto the existing Contrastive Audio-Visual Masked AutoEncoder to enhance its\nobject awareness. To avoid costly manual annotations, we prepare object labels\nfrom both audio and visual inputs using state-of-the-art language-audio models\nand object detectors. We evaluate the method on audio-visual retrieval and\nclassification using the VGGSound and AudioSet20K datasets. Our method achieves\nimprovements in recall@10 of +1.5% and +1.2% for audio-to-visual and\nvisual-to-audio retrieval, respectively, and an improvement in accuracy of\n+0.6% for audio-visual classification.\n","authors":["Shota Nakada","Taichi Nishimura","Hokuto Munakata","Masayoshi Kondo","Tatsuya Komatsu"],"pdf_url":"https://arxiv.org/pdf/2409.11729v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2407.19493v2","updated":"2024-09-18T00:31:27Z","published":"2024-07-28T13:23:43Z","title":"Official-NV: An LLM-Generated News Video Dataset for Multimodal Fake\n News Detection","summary":" News media, especially video news media, have penetrated into every aspect of\ndaily life, which also brings the risk of fake news. Therefore, multimodal fake\nnews detection has recently garnered increased attention. However, the existing\ndatasets are composed of user-uploaded videos and contain an excess amount of\nsuperfluous data, which introduces noise into the model training process. To\naddress this issue, we construct a dataset named Official-NV, comprising\nofficially published news videos. 
The crawled officially published videos are\naugmented through the use of LLM-based generation and manual verification,\nthereby expanding the dataset. Furthermore, the proposed dataset is benchmarked\nagainst several baselines to demonstrate its effectiveness in multimodal fake news\ndetection.\n","authors":["Yihao Wang","Lizhi Chen","Zhong Qian","Peifeng Li"],"pdf_url":"https://arxiv.org/pdf/2407.19493v2.pdf","comment":null}]},"2024-09-23T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.12181v2","updated":"2024-09-23T14:39:07Z","published":"2024-09-18T17:53:17Z","title":"A Controlled Study on Long Context Extension and Generalization in LLMs","summary":" Broad textual understanding and in-context learning require language models\nthat utilize full document contexts. Due to the implementation challenges\nassociated with directly training long-context models, many methods have been\nproposed for extending models to handle long contexts. However, owing to\ndifferences in data and model classes, it has been challenging to compare these\napproaches, leading to uncertainty as to how to evaluate long-context\nperformance and whether it differs from standard evaluation. We implement a\ncontrolled protocol for extension methods with a standardized evaluation,\nutilizing consistent base models and extension data. Our study yields several\ninsights into long-context behavior. First, we reaffirm the critical role of\nperplexity as a general-purpose performance indicator even in longer-context\ntasks. Second, we find that current approximate attention methods\nsystematically underperform across long-context tasks. Finally, we confirm that\nexact fine-tuning based methods are generally effective within the range of\ntheir extension, whereas extrapolation remains challenging. All codebases,\nmodels, and checkpoints will be made available open-source, promoting\ntransparency and facilitating further research in this critical area of AI\ndevelopment.\n","authors":["Yi Lu","Jing Nathan Yan","Songlin Yang","Justin T. Chiu","Siyu Ren","Fei Yuan","Wenting Zhao","Zhiyong Wu","Alexander M. Rush"],"pdf_url":"https://arxiv.org/pdf/2409.12181v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2407.14614v3","updated":"2024-09-23T10:46:48Z","published":"2024-07-19T18:13:37Z","title":"Evaluating language models as risk scores","summary":" Current question-answering benchmarks predominantly focus on accuracy in\nrealizable prediction tasks. Conditioned on a question and answer-key, does the\nmost likely token match the ground truth? Such benchmarks necessarily fail to\nevaluate LLMs' ability to quantify ground-truth outcome uncertainty. In this\nwork, we focus on the use of LLMs as risk scores for unrealizable prediction\ntasks. We introduce folktexts, a software package to systematically generate\nrisk scores using LLMs, and evaluate them against US Census data products. A\nflexible API enables the use of different prompting schemes, local or\nweb-hosted models, and diverse census columns that can be used to compose\ncustom prediction tasks. We evaluate 17 recent LLMs across five proposed\nbenchmark tasks. We find that zero-shot risk scores produced by multiple-choice\nquestion-answering have high predictive signal but are widely miscalibrated.\nBase models consistently overestimate outcome uncertainty, while\ninstruction-tuned models underestimate uncertainty and produce over-confident\nrisk scores. 
In fact, instruction-tuning polarizes answer distribution\nregardless of true underlying data uncertainty. This reveals a general\ninability of instruction-tuned LLMs to express data uncertainty using\nmultiple-choice answers. A separate experiment using verbalized chat-style risk\nqueries yields substantially improved calibration across instruction-tuned\nmodels. These differences in ability to quantify data uncertainty cannot be\nrevealed in realizable settings, and highlight a blind-spot in the current\nevaluation ecosystem that folktexts covers.\n","authors":["André F. Cruz","Moritz Hardt","Celestine Mendler-Dünner"],"pdf_url":"https://arxiv.org/pdf/2407.14614v3.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2409.12181v2","updated":"2024-09-23T14:39:07Z","published":"2024-09-18T17:53:17Z","title":"A Controlled Study on Long Context Extension and Generalization in LLMs","summary":" Broad textual understanding and in-context learning require language models\nthat utilize full document contexts. Due to the implementation challenges\nassociated with directly training long-context models, many methods have been\nproposed for extending models to handle long contexts. However, owing to\ndifferences in data and model classes, it has been challenging to compare these\napproaches, leading to uncertainty as to how to evaluate long-context\nperformance and whether it differs from standard evaluation. We implement a\ncontrolled protocol for extension methods with a standardized evaluation,\nutilizing consistent base models and extension data. Our study yields several\ninsights into long-context behavior. First, we reaffirm the critical role of\nperplexity as a general-purpose performance indicator even in longer-context\ntasks. Second, we find that current approximate attention methods\nsystematically underperform across long-context tasks. Finally, we confirm that\nexact fine-tuning based methods are generally effective within the range of\ntheir extension, whereas extrapolation remains challenging. All codebases,\nmodels, and checkpoints will be made available open-source, promoting\ntransparency and facilitating further research in this critical area of AI\ndevelopment.\n","authors":["Yi Lu","Jing Nathan Yan","Songlin Yang","Justin T. Chiu","Siyu Ren","Fei Yuan","Wenting Zhao","Zhiyong Wu","Alexander M. Rush"],"pdf_url":"https://arxiv.org/pdf/2409.12181v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2404.13503v4","updated":"2024-09-23T17:48:54Z","published":"2024-04-21T01:53:20Z","title":"Calibration Error for Decision Making","summary":" Calibration allows predictions to be reliably interpreted as probabilities by\ndecision makers. We propose a decision-theoretic calibration error, the\nCalibration Decision Loss (CDL), defined as the maximum improvement in decision\npayoff obtained by calibrating the predictions, where the maximum is over all\npayoff-bounded decision tasks. Vanishing CDL guarantees the payoff loss from\nmiscalibration vanishes simultaneously for all downstream decision tasks. We\nshow separations between CDL and existing calibration error metrics, including\nthe most well-studied metric Expected Calibration Error (ECE). 
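For reference, here is a sketch of the binned Expected Calibration Error that CDL is contrasted with, in one common binary-prediction variant. Binning and confidence conventions vary across papers, so treat the choices below as illustrative assumptions rather than the paper's definition.

```python
import numpy as np

def expected_calibration_error(probs, labels, n_bins=10):
    # Binned ECE (one common variant): partition predictions by confidence,
    # then average |empirical frequency - mean confidence|, weighted by the
    # fraction of points landing in each bin.
    probs = np.asarray(probs, dtype=float)   # predicted P(y = 1)
    labels = np.asarray(labels, dtype=int)   # binary outcomes
    edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(edges[:-1], edges[1:]):
        mask = (probs >= lo) & (probs < hi) if hi < 1.0 else (probs >= lo)
        if mask.sum() == 0:
            continue
        conf = probs[mask].mean()            # mean predicted probability
        acc = labels[mask].mean()            # empirical frequency in the bin
        ece += mask.mean() * abs(acc - conf)
    return ece

rng = np.random.default_rng(0)
p = rng.uniform(size=1000)
y = rng.uniform(size=1000) < np.clip(p + 0.2, 0, 1)  # miscalibrated scores
print(round(expected_calibration_error(p, y), 3))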
Our main\ntechnical contribution is a new efficient algorithm for online calibration that\nachieves near-optimal $O(\\frac{\\log T}{\\sqrt{T}})$ expected CDL, bypassing the\n$\\Omega(T^{-0.472})$ lower bound for ECE by Qiao and Valiant (2021).\n","authors":["Lunjia Hu","Yifan Wu"],"pdf_url":"https://arxiv.org/pdf/2404.13503v4.pdf","comment":"In FOCS 2024"},{"id":"http://arxiv.org/abs/2409.12045v2","updated":"2024-09-23T12:42:32Z","published":"2024-09-18T15:08:41Z","title":"Handling Long-Term Safety and Uncertainty in Safe Reinforcement Learning","summary":" Safety is one of the key issues preventing the deployment of reinforcement\nlearning techniques in real-world robots. While most approaches in the Safe\nReinforcement Learning area do not require prior knowledge of constraints and\nrobot kinematics and rely solely on data, it is often difficult to deploy them\nin complex real-world settings. Instead, model-based approaches that\nincorporate prior knowledge of the constraints and dynamics into the learning\nframework have proven capable of deploying the learning algorithm directly on\nthe real robot. Unfortunately, while an approximated model of the robot\ndynamics is often available, the safety constraints are task-specific and hard\nto obtain: they may be too complicated to encode analytically, too expensive to\ncompute, or it may be difficult to envision a priori the long-term safety\nrequirements. In this paper, we bridge this gap by extending the safe\nexploration method, ATACOM, with learnable constraints, with a particular focus\non ensuring long-term safety and handling of uncertainty. Our approach is\ncompetitive or superior to state-of-the-art methods in final performance while\nmaintaining safer behavior during training.\n","authors":["Jonas Günster","Puze Liu","Jan Peters","Davide Tateo"],"pdf_url":"https://arxiv.org/pdf/2409.12045v2.pdf","comment":"Preprint version of a paper accepted to the Conference on Robot\n Learning"}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.16132v2","updated":"2024-09-23T05:08:20Z","published":"2024-08-28T20:48:04Z","title":"SVDD 2024: The Inaugural Singing Voice Deepfake Detection Challenge","summary":" With the advancements in singing voice generation and the growing presence of\nAI singers on media platforms, the inaugural Singing Voice Deepfake Detection\n(SVDD) Challenge aims to advance research in identifying AI-generated singing\nvoices from authentic singers. This challenge features two tracks: a controlled\nsetting track (CtrSVDD) and an in-the-wild scenario track (WildSVDD). The\nCtrSVDD track utilizes publicly available singing vocal data to generate\ndeepfakes using state-of-the-art singing voice synthesis and conversion\nsystems. Meanwhile, the WildSVDD track expands upon the existing SingFake\ndataset, which includes data sourced from popular user-generated content\nwebsites. For the CtrSVDD track, we received submissions from 47 teams, with 37\nsurpassing our baselines and the top team achieving a 1.65% equal error rate.\nFor the WildSVDD track, we benchmarked the baselines. 
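For readers unfamiliar with the metric quoted for the CtrSVDD track, here is a simple threshold-sweep sketch of the equal error rate (EER). The scoring convention (higher score = more deepfake-like) and all names are assumptions for illustration, not the challenge's official scoring code.

```python
import numpy as np

def equal_error_rate(scores, labels):
    # EER: the operating point where the false-positive rate (bona fide
    # flagged as deepfake) meets the false-negative rate (deepfake missed).
    # scores: higher = more deepfake-like; labels: 1 = deepfake, 0 = bona fide.
    scores, labels = np.asarray(scores), np.asarray(labels)
    n_pos = labels.sum()
    n_neg = len(labels) - n_pos
    best_gap, eer = np.inf, 0.5
    for t in np.sort(scores):                # sweep every observed threshold
        fpr = ((scores >= t) & (labels == 0)).sum() / n_neg
        fnr = ((scores < t) & (labels == 1)).sum() / n_pos
        if abs(fpr - fnr) < best_gap:
            best_gap, eer = abs(fpr - fnr), (fpr + fnr) / 2
    return eer

rng = np.random.default_rng(0)
scores = np.concatenate([rng.normal(0, 1, 500),    # bona-fide scores
                         rng.normal(2, 1, 500)])   # deepfake scores
labels = np.concatenate([np.zeros(500), np.ones(500)])
print(f"EER ~= {equal_error_rate(scores, labels):.3f}")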
This paper reviews these\nresults, discusses key findings, and outlines future directions for SVDD\nresearch.\n","authors":["You Zhang","Yongyi Zang","Jiatong Shi","Ryuichi Yamamoto","Tomoki Toda","Zhiyao Duan"],"pdf_url":"https://arxiv.org/pdf/2408.16132v2.pdf","comment":"6 pages, Accepted by 2024 IEEE Spoken Language Technology Workshop\n (SLT 2024)"}]},"2024-09-19T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.12097v2","updated":"2024-09-19T12:10:38Z","published":"2024-09-18T16:15:18Z","title":"Skill matching at scale: freelancer-project alignment for efficient\n multilingual candidate retrieval","summary":" Finding the perfect match between a job proposal and a set of freelancers is\nnot an easy task to perform at scale, especially in multiple languages. In this\npaper, we propose a novel neural retriever architecture that tackles this\nproblem in a multilingual setting. Our method encodes project descriptions and\nfreelancer profiles by leveraging pre-trained multilingual language models. The\nlatter are used as the backbone for a custom transformer architecture that aims to\nkeep the structure of the profiles and projects. This model is trained with a\ncontrastive loss on historical data. Through several experiments, we show\nthat this approach effectively captures skill matching similarity and\nfacilitates efficient matching, outperforming traditional methods.\n","authors":["Warren Jouanneau","Marc Palyart","Emma Jouffroy"],"pdf_url":"https://arxiv.org/pdf/2409.12097v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12046v2","updated":"2024-09-19T02:48:43Z","published":"2024-09-18T15:16:37Z","title":"Using Large Language Models to Generate Clinical Trial Tables and\n Figures","summary":" Tables, figures, and listings (TFLs) are essential tools for summarizing\nclinical trial data. Creation of TFLs for reporting activities is often a\ntime-consuming task encountered routinely during the execution of clinical\ntrials. This study explored the use of large language models (LLMs) to automate\nthe generation of TFLs through prompt engineering and few-shot transfer\nlearning. Using public clinical trial data in ADaM format, our results\ndemonstrated that LLMs can efficiently generate TFLs with prompt instructions,\nshowcasing their potential in this domain. Furthermore, we developed a\nconversational agent named Clinical Trial TFL Generation Agent: an app that\nmatches user queries to predefined prompts that produce customized programs to\ngenerate specific predefined TFLs.\n","authors":["Yumeng Yang","Peter Krusche","Kristyn Pantoja","Cheng Shi","Ethan Ludmir","Kirk Roberts","Gen Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.12046v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11272v3","updated":"2024-09-19T15:50:01Z","published":"2024-09-17T15:23:08Z","title":"LOLA -- An Open-Source Massively Multilingual Large Language Model","summary":" This paper presents LOLA, a massively multilingual large language model\ntrained on more than 160 languages using a sparse Mixture-of-Experts\nTransformer architecture. Our architectural and implementation choices address\nthe challenge of harnessing linguistic diversity while maintaining efficiency\nand avoiding the common pitfalls of multilinguality. Our analysis of the\nevaluation results shows competitive performance in natural language generation\nand understanding tasks. 
Additionally, we demonstrate how the learned\nexpert-routing mechanism exploits implicit phylogenetic linguistic patterns to\npotentially alleviate the curse of multilinguality. We provide an in-depth look\nat the training process, an analysis of the datasets, and a balanced\nexploration of the model's strengths and limitations. As an open-source model,\nLOLA promotes reproducibility and serves as a robust foundation for future\nresearch. Our findings enable the development of compute-efficient multilingual\nmodels with strong, scalable performance across languages.\n","authors":["Nikit Srivastava","Denis Kuchelev","Tatiana Moteu Ngoli","Kshitij Shetty","Michael Röder","Diego Moussallem","Hamada Zahera","Axel-Cyrille Ngonga Ngomo"],"pdf_url":"https://arxiv.org/pdf/2409.11272v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10795v3","updated":"2024-09-19T07:24:02Z","published":"2024-08-20T12:43:58Z","title":"Adversarial Attack for Explanation Robustness of Rationalization Models","summary":" Rationalization models, which select a subset of input text as\nthe rationale, crucial for humans to understand and trust predictions, have recently\nemerged as a prominent research area in eXplainable Artificial Intelligence.\nHowever, most previous studies mainly focus on improving the quality of the\nrationale, ignoring its robustness to malicious attacks. Specifically, whether\nrationalization models can still generate high-quality rationales under\nadversarial attack remains unknown. To explore this, this paper proposes UAT2E,\nwhich aims to undermine the explainability of rationalization models without\naltering their predictions, thereby eliciting distrust in these models from\nhuman users. UAT2E employs gradient-based search on triggers and then\ninserts them into the original input to conduct both non-target and target\nattacks. Experimental results on five datasets reveal the vulnerability of\nrationalization models in terms of explanation, where they tend to select more\nmeaningless tokens under attacks. Based on this, we make a series of\nrecommendations for improving rationalization models in terms of explanation.\n","authors":["Yuankai Zhang","Lingxiao Kong","Haozhao Wang","Ruixuan Li","Jun Wang","Yuhua Li","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2408.10795v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11261v3","updated":"2024-09-19T09:50:58Z","published":"2024-09-17T15:10:23Z","title":"The Art of Storytelling: Multi-Agent Generative AI for Dynamic\n Multimodal Narratives","summary":" This paper introduces the concept of an education tool that utilizes\nGenerative Artificial Intelligence (GenAI) to enhance storytelling for\nchildren. The system combines GenAI-driven narrative co-creation,\ntext-to-speech conversion, and text-to-video generation to produce an engaging\nexperience for learners. We describe the co-creation process, the adaptation of\nnarratives into spoken words using text-to-speech models, and the\ntransformation of these narratives into contextually relevant visuals through\ntext-to-video technology. 
Our evaluation covers the linguistics of the\ngenerated stories, the text-to-speech conversion quality, and the accuracy of\nthe generated visuals.\n","authors":["Samee Arif","Taimoor Arif","Muhammad Saad Haroon","Aamina Jamal Khan","Agha Ali Raza","Awais Athar"],"pdf_url":"https://arxiv.org/pdf/2409.11261v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11733v2","updated":"2024-09-19T02:33:54Z","published":"2024-09-18T06:42:13Z","title":"Human-like Affective Cognition in Foundation Models","summary":" Understanding emotions is fundamental to human interaction and experience.\nHumans easily infer emotions from situations or facial expressions, situations\nfrom emotions, and do a variety of other affective cognition. How adept is\nmodern AI at these inferences? We introduce an evaluation framework for testing\naffective cognition in foundation models. Starting from psychological theory,\nwe generate 1,280 diverse scenarios exploring relationships between appraisals,\nemotions, expressions, and outcomes. We evaluate the abilities of foundation\nmodels (GPT-4, Claude-3, Gemini-1.5-Pro) and humans (N = 567) across carefully\nselected conditions. Our results show foundation models tend to agree with\nhuman intuitions, matching or exceeding interparticipant agreement. In some\nconditions, models are ``superhuman'' -- they better predict modal human\njudgements than the average human. All models benefit from chain-of-thought\nreasoning. This suggests foundation models have acquired a human-like\nunderstanding of emotions and their influence on beliefs and behavior.\n","authors":["Kanishk Gandhi","Zoe Lynch","Jan-Philipp Fränken","Kayla Patterson","Sharon Wambu","Tobias Gerstenberg","Desmond C. Ong","Noah D. Goodman"],"pdf_url":"https://arxiv.org/pdf/2409.11733v2.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2408.02615v3","updated":"2024-09-19T16:07:14Z","published":"2024-08-05T16:39:39Z","title":"LaMamba-Diff: Linear-Time High-Fidelity Diffusion Models Based on Local\n Attention and Mamba","summary":" Recent Transformer-based diffusion models have shown remarkable performance,\nlargely attributed to the ability of the self-attention mechanism to accurately\ncapture both global and local contexts by computing all-pair interactions among\ninput tokens. However, their quadratic complexity poses significant\ncomputational challenges for long-sequence inputs. Conversely, a recent state\nspace model called Mamba offers linear complexity by compressing a filtered\nglobal context into a hidden state. Despite its efficiency, compression\ninevitably leads to information loss of fine-grained local dependencies among\ntokens, which are crucial for effective visual generative modeling. Motivated\nby these observations, we introduce Local Attentional Mamba (LaMamba) blocks\nthat combine the strengths of self-attention and Mamba, capturing both global\ncontexts and local details with linear complexity. Leveraging the efficient\nU-Net architecture, our model exhibits exceptional scalability and surpasses\nthe performance of DiT across various model scales on ImageNet at 256x256\nresolution, all while utilizing substantially fewer GFLOPs and a comparable\nnumber of parameters. Compared to state-of-the-art diffusion models on ImageNet\n256x256 and 512x512, our largest model presents notable advantages, such as a\nreduction of up to 62% GFLOPs compared to DiT-XL/2, while achieving superior\nperformance with comparable or fewer parameters. 
Our code is available at\nhttps://github.com/yunxiangfu2001/LaMamba-Diff.\n","authors":["Yunxiang Fu","Chaoqi Chen","Yizhou Yu"],"pdf_url":"https://arxiv.org/pdf/2408.02615v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17448v3","updated":"2024-09-19T14:32:45Z","published":"2023-03-30T15:20:21Z","title":"NN-Copula-CD: A Copula-Guided Interpretable Neural Network for Change\n Detection in Heterogeneous Remote Sensing Images","summary":" Change detection (CD) in heterogeneous remote sensing images has been widely\nused for disaster monitoring and land-use management. In the past decade, the\nheterogeneous CD problem has significantly benefited from the development of\ndeep neural networks (DNNs). However, the purely data-driven DNNs perform like\na black box where the lack of interpretability limits the trustworthiness and\ncontrollability of DNNs in most practical CD applications. As a powerful\nknowledge-driven tool, copula theory performs well in modeling relationships\namong random variables. To enhance the interpretability of existing neural\nnetworks for CD, we propose a knowledge-data-driven heterogeneous CD method\nbased on a copula-guided neural network, named NN-Copula-CD. In our\nNN-Copula-CD, the mathematical characteristics of copula are employed as the\nloss functions to supervise a neural network to learn the dependence between\nbi-temporal heterogeneous superpixel pairs, and then the changed regions are\nidentified via binary classification based on the degrees of dependence of all\nthe superpixel pairs in the bi-temporal images. We conduct in-depth experiments\non three datasets with heterogeneous images, where both quantitative and visual\nresults demonstrate the effectiveness of our proposed NN-Copula-CD method.\n","authors":["Weiming Li","Xueqian Wang","Gang Li","Baocheng Geng","Pramod K. Varshney"],"pdf_url":"https://arxiv.org/pdf/2303.17448v3.pdf","comment":"The full version of this work is submitted to IEEE TGRS"},{"id":"http://arxiv.org/abs/2409.11752v2","updated":"2024-09-19T09:39:49Z","published":"2024-09-18T07:10:24Z","title":"Cross-Organ and Cross-Scanner Adenocarcinoma Segmentation using Rein to\n Fine-tune Vision Foundation Models","summary":" In recent years, significant progress has been made in tumor segmentation\nwithin the field of digital pathology. However, variations in organs, tissue\npreparation methods, and image acquisition processes can lead to domain\ndiscrepancies among digital pathology images. To address this problem, in this\npaper, we use Rein, a fine-tuning method, to parametrically and efficiently\nfine-tune various vision foundation models (VFMs) for MICCAI 2024 Cross-Organ\nand Cross-Scanner Adenocarcinoma Segmentation (COSAS2024). The core of Rein\nconsists of a set of learnable tokens, which are directly linked to instances,\nimproving functionality at the instance level in each layer. In the data\nenvironment of the COSAS2024 Challenge, extensive experiments demonstrate that\nRein fine-tuned the VFMs to achieve satisfactory results. Specifically, we used\nRein to fine-tune ConvNeXt and DINOv2. Our team used the former to achieve\nscores of 0.7719 and 0.7557 on the preliminary test phase and final test phase\nin task1, respectively, while the latter achieved scores of 0.8848 and 0.8192\non the preliminary test phase and final test phase in task2. 
Code is available\non GitHub.\n","authors":["Pengzhou Cai","Xueyuan Zhang","Libin Lan","Ze Zhao"],"pdf_url":"https://arxiv.org/pdf/2409.11752v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11738v2","updated":"2024-09-19T03:54:58Z","published":"2024-09-18T06:51:29Z","title":"Adaptive Selection of Sampling-Reconstruction in Fourier Compressed\n Sensing","summary":" Compressed sensing (CS) has emerged to overcome the inefficiency of Nyquist\nsampling. However, traditional optimization-based reconstruction is slow and\ncannot yield an exact image in practice. Deep learning-based reconstruction\nhas been a promising alternative to optimization-based reconstruction,\noutperforming it in accuracy and computation speed. Finding an efficient\nsampling method with deep learning-based reconstruction, especially for Fourier\nCS, remains a challenge. Existing joint optimization of sampling-reconstruction\nworks ($\mathcal{H}_1$) optimize the sampling mask but have low potential as they\nare not adaptive to each data point. Adaptive sampling ($\mathcal{H}_2$) also\nhas the disadvantages of difficult optimization and Pareto sub-optimality. Here,\nwe propose a novel adaptive selection of sampling-reconstruction\n($\mathcal{H}_{1.5}$) framework that selects the best sampling mask and\nreconstruction network for each input data. We provide theorems showing that our method\nhas a higher potential than $\mathcal{H}_1$ and effectively solves the Pareto\nsub-optimality problem in sampling-reconstruction by using separate\nreconstruction networks for different sampling masks. To select the best\nsampling mask, we propose to quantify the high-frequency Bayesian uncertainty\nof the input, using a super-resolution space generation model. Our method\noutperforms joint optimization of sampling-reconstruction ($\mathcal{H}_1$) and\nadaptive sampling ($\mathcal{H}_2$) by achieving significant improvements on\nseveral Fourier CS problems.\n","authors":["Seongmin Hong","Jaehyeok Bae","Jongho Lee","Se Young Chun"],"pdf_url":"https://arxiv.org/pdf/2409.11738v2.pdf","comment":"30 pages, 9.8 MB, Accepted to ECCV 2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.12097v2","updated":"2024-09-19T12:10:38Z","published":"2024-09-18T16:15:18Z","title":"Skill matching at scale: freelancer-project alignment for efficient\n multilingual candidate retrieval","summary":" Finding the perfect match between a job proposal and a set of freelancers is\nnot an easy task to perform at scale, especially in multiple languages. In this\npaper, we propose a novel neural retriever architecture that tackles this\nproblem in a multilingual setting. Our method encodes project descriptions and\nfreelancer profiles by leveraging pre-trained multilingual language models. The\nlatter are used as the backbone for a custom transformer architecture that aims to\nkeep the structure of the profiles and projects. This model is trained with a\ncontrastive loss on historical data. 
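The abstract says only that the retriever is trained with "a contrastive loss on historical data"; a standard in-batch InfoNCE objective over (project, hired-freelancer) pairs is one plausible reading, sketched below. All names, the temperature value, and the pairing convention are illustrative assumptions, not the paper's formulation.

```python
import numpy as np

def info_nce(project_emb, freelancer_emb, temperature=0.07):
    # In-batch contrastive loss sketch: row i of each matrix is a historical
    # (project, freelancer) match; the other rows in the batch act as
    # negatives, pushing matched pairs together and mismatches apart.
    P = project_emb / np.linalg.norm(project_emb, axis=1, keepdims=True)
    F = freelancer_emb / np.linalg.norm(freelancer_emb, axis=1, keepdims=True)
    logits = P @ F.T / temperature              # scaled cosine similarities
    logits -= logits.max(axis=1, keepdims=True) # numerical stability
    log_softmax = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    return -np.mean(np.diag(log_softmax))       # matched pairs on the diagonal

rng = np.random.default_rng(0)
proj = rng.normal(size=(8, 64))
free = proj + 0.1 * rng.normal(size=(8, 64))    # aligned pairs -> low loss
print(round(info_nce(proj, free), 3))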
Thanks to several experiments, we show\nthat this approach effectively captures skill matching similarity and\nfacilitates efficient matching, outperforming traditional methods.\n","authors":["Warren Jouanneau","Marc Palyart","Emma Jouffroy"],"pdf_url":"https://arxiv.org/pdf/2409.12097v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11690v2","updated":"2024-09-19T03:28:21Z","published":"2024-09-18T04:10:44Z","title":"LLM-Powered Text Simulation Attack Against ID-Free Recommender Systems","summary":" The ID-free recommendation paradigm has been proposed to address the\nlimitation that traditional recommender systems struggle to model cold-start\nusers or items with new IDs. Despite its effectiveness, this study uncovers\nthat ID-free recommender systems are vulnerable to the proposed Text Simulation\nattack (TextSimu) which aims to promote specific target items. As a novel type\nof text poisoning attack, TextSimu exploits large language models (LLMs) to\nalter the textual information of target items by simulating the characteristics\nof popular items. It operates effectively in both black-box and white-box\nsettings, utilizing two key components: a unified popularity extraction module,\nwhich captures the essential characteristics of popular items, and an N-persona\nconsistency simulation strategy, which creates multiple personas to\ncollaboratively synthesize refined promotional textual descriptions for target\nitems by simulating the popular items. To withstand TextSimu-like attacks, we\nfurther explore a detection approach for identifying LLM-generated\npromotional text. Extensive experiments conducted on three datasets demonstrate\nthat TextSimu poses a more significant threat than existing poisoning attacks,\nwhile our defense method can detect malicious text of target items generated by\nTextSimu. By identifying the vulnerability, we aim to advance the development\nof more robust ID-free recommender systems.\n","authors":["Zongwei Wang","Min Gao","Junliang Yu","Xinyi Gao","Quoc Viet Hung Nguyen","Shazia Sadiq","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2409.11690v2.pdf","comment":"12 pages"},{"id":"http://arxiv.org/abs/2409.10173v3","updated":"2024-09-19T11:21:24Z","published":"2024-09-16T11:10:29Z","title":"jina-embeddings-v3: Multilingual Embeddings With Task LoRA","summary":" We introduce jina-embeddings-v3, a novel text embedding model with 570\nmillion parameters that achieves state-of-the-art performance on multilingual data\nand long-context retrieval tasks, supporting context lengths of up to 8192\ntokens. The model includes a set of task-specific Low-Rank Adaptation (LoRA)\nadapters to generate high-quality embeddings for query-document retrieval,\nclustering, classification, and text matching. Evaluation on the MTEB benchmark\nshows that jina-embeddings-v3 outperforms the latest proprietary embeddings\nfrom OpenAI and Cohere on English tasks, while achieving superior performance\ncompared to multilingual-e5-large-instruct across all multilingual tasks. 
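The freelancer-project retriever above is trained "with a contrastive loss on historical data"; the abstract does not spell out the loss, so the sketch below assumes a standard in-batch symmetric InfoNCE over a bi-encoder, which is a common choice for such retrievers rather than the paper's confirmed objective.

```python
# Hedged sketch of training a bi-encoder retriever with an in-batch
# contrastive (InfoNCE) loss; architecture and loss details are assumptions.
import torch
import torch.nn.functional as F

def info_nce(project_emb, freelancer_emb, temperature=0.05):
    """In-batch contrastive loss: the i-th project's positive is the
    i-th freelancer; all other freelancers in the batch are negatives."""
    p = F.normalize(project_emb, dim=-1)
    f = F.normalize(freelancer_emb, dim=-1)
    logits = p @ f.T / temperature          # (B, B) similarity matrix
    targets = torch.arange(p.size(0))       # diagonal = matching pairs
    return F.cross_entropy(logits, targets)

# Stand-ins for encoder outputs of a multilingual language-model backbone.
batch = 16
projects = torch.randn(batch, 768, requires_grad=True)
freelancers = torch.randn(batch, 768, requires_grad=True)
loss = info_nce(projects, freelancers)
loss.backward()
print(f"contrastive loss: {loss.item():.3f}")
```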
With\na default output dimension of 1024, users can flexibly reduce the embedding\ndimensions to as low as 32 without compromising performance, enabled by\nMatryoshka Representation Learning.\n","authors":["Saba Sturua","Isabelle Mohr","Mohammad Kalim Akram","Michael Günther","Bo Wang","Markus Krimmel","Feng Wang","Georgios Mastrapas","Andreas Koukounas","Nan Wang","Han Xiao"],"pdf_url":"https://arxiv.org/pdf/2409.10173v3.pdf","comment":"20 pages, pp11-13 references, pp14-20 appendix and experiment tables"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2409.12097v2","updated":"2024-09-19T12:10:38Z","published":"2024-09-18T16:15:18Z","title":"Skill matching at scale: freelancer-project alignment for efficient\n multilingual candidate retrieval","summary":" Finding the perfect match between a job proposal and a set of freelancers is\nnot an easy task to perform at scale, especially in multiple languages. In this\npaper, we propose a novel neural retriever architecture that tackles this\nproblem in a multilingual setting. Our method encodes project descriptions and\nfreelancer profiles by leveraging pre-trained multilingual language models. The\nlatter are used as backbone for a custom transformer architecture that aims to\nkeep the structure of the profiles and project. This model is trained with a\ncontrastive loss on historical data. Thanks to several experiments, we show\nthat this approach effectively captures skill matching similarity and\nfacilitates efficient matching, outperforming traditional methods.\n","authors":["Warren Jouanneau","Marc Palyart","Emma Jouffroy"],"pdf_url":"https://arxiv.org/pdf/2409.12097v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.12089v2","updated":"2024-09-19T05:44:21Z","published":"2024-09-18T16:04:10Z","title":"The Impact of Element Ordering on LM Agent Performance","summary":" There has been a surge of interest in language model agents that can navigate\nvirtual environments such as the web or desktop. To navigate such environments,\nagents benefit from information on the various elements (e.g., buttons, text,\nor images) present. It remains unclear which element attributes have the\ngreatest impact on agent performance, especially in environments that only\nprovide a graphical representation (i.e., pixels). Here we find that the\nordering in which elements are presented to the language model is surprisingly\nimpactful--randomizing element ordering in a webpage degrades agent performance\ncomparably to removing all visible text from an agent's state representation.\nWhile a webpage provides a hierarchical ordering of elements, there is no such\nordering when parsing elements directly from pixels. Moreover, as tasks become\nmore challenging and models more sophisticated, our experiments suggest that\nthe impact of ordering increases. Finding an effective ordering is non-trivial.\nWe investigate the impact of various element ordering methods in web and\ndesktop environments. We find that dimensionality reduction provides a viable\nordering for pixel-only environments. We train a UI element detection model to\nderive elements from pixels and apply our findings to an agent\nbenchmark--OmniACT--where we only have access to pixels. 
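jina-embeddings-v3 above attributes its dimension flexibility to Matryoshka Representation Learning. The usual inference-time recipe is simply to truncate and re-normalize, as sketched below; random vectors stand in for embeddings from an actual MRL-trained encoder, which is what makes the truncation lossless in practice.

```python
# Sketch of Matryoshka-style dimension reduction: keep the first k
# dimensions of a 1024-d embedding and re-normalize before cosine scoring.
import numpy as np

def truncate_embedding(vec, k):
    """Keep the first k dimensions and L2-normalize (Matryoshka usage)."""
    v = np.asarray(vec, dtype=np.float64)[:k]
    return v / np.linalg.norm(v)

rng = np.random.default_rng(0)
a, b = rng.standard_normal(1024), rng.standard_normal(1024)
for k in (1024, 256, 64, 32):
    sim = float(truncate_embedding(a, k) @ truncate_embedding(b, k))
    print(f"dim={k:4d}  cosine={sim:+.3f}")
```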
Our method completes\nmore than two times as many tasks on average relative to the previous\nstate-of-the-art.\n","authors":["Wayne Chi","Ameet Talwalkar","Chris Donahue"],"pdf_url":"https://arxiv.org/pdf/2409.12089v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.17448v3","updated":"2024-09-19T14:32:45Z","published":"2023-03-30T15:20:21Z","title":"NN-Copula-CD: A Copula-Guided Interpretable Neural Network for Change\n Detection in Heterogeneous Remote Sensing Images","summary":" Change detection (CD) in heterogeneous remote sensing images has been widely\nused for disaster monitoring and land-use management. In the past decade, the\nheterogeneous CD problem has significantly benefited from the development of\ndeep neural networks (DNNs). However, the purely data-driven DNNs perform like\na black box where the lack of interpretability limits the trustworthiness and\ncontrollability of DNNs in most practical CD applications. As a powerful\nknowledge-driven tool, copula theory performs well in modeling relationships\namong random variables. To enhance the interpretability of existing neural\nnetworks for CD, we propose a knowledge-data-driven heterogeneous CD method\nbased on a copula-guided neural network, named NN-Copula-CD. In our\nNN-Copula-CD, the mathematical characteristics of copula are employed as the\nloss functions to supervise a neural network to learn the dependence between\nbi-temporal heterogeneous superpixel pairs, and then the changed regions are\nidentified via binary classification based on the degrees of dependence of all\nthe superpixel pairs in the bi-temporal images. We conduct in-depth experiments\non three datasets with heterogeneous images, where both quantitative and visual\nresults demonstrate the effectiveness of our proposed NN-Copula-CD method.\n","authors":["Weiming Li","Xueqian Wang","Gang Li","Baocheng Geng","Pramod K. Varshney"],"pdf_url":"https://arxiv.org/pdf/2303.17448v3.pdf","comment":"The full version of this work is submitted to IEEE TGRS"},{"id":"http://arxiv.org/abs/2409.11272v3","updated":"2024-09-19T15:50:01Z","published":"2024-09-17T15:23:08Z","title":"LOLA -- An Open-Source Massively Multilingual Large Language Model","summary":" This paper presents LOLA, a massively multilingual large language model\ntrained on more than 160 languages using a sparse Mixture-of-Experts\nTransformer architecture. Our architectural and implementation choices address\nthe challenge of harnessing linguistic diversity while maintaining efficiency\nand avoiding the common pitfalls of multilinguality. Our analysis of the\nevaluation results shows competitive performance in natural language generation\nand understanding tasks. Additionally, we demonstrate how the learned\nexpert-routing mechanism exploits implicit phylogenetic linguistic patterns to\npotentially alleviate the curse of multilinguality. We provide an in-depth look\nat the training process, an analysis of the datasets, and a balanced\nexploration of the model's strengths and limitations. As an open-source model,\nLOLA promotes reproducibility and serves as a robust foundation for future\nresearch. 
Our findings enable the development of compute-efficient multilingual\nmodels with strong, scalable performance across languages.\n","authors":["Nikit Srivastava","Denis Kuchelev","Tatiana Moteu Ngoli","Kshitij Shetty","Michael Röder","Diego Moussallem","Hamada Zahera","Axel-Cyrille Ngonga Ngomo"],"pdf_url":"https://arxiv.org/pdf/2409.11272v3.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2408.15521v2","updated":"2024-09-19T06:21:03Z","published":"2024-08-28T04:14:01Z","title":"A Simple Baseline with Single-encoder for Referring Image Segmentation","summary":" Referring image segmentation (RIS) requires dense vision-language\ninteractions between visual pixels and textual words to segment objects based\non a given description. However, commonly adapted dual-encoders in RIS, e.g.,\nSwin transformer and BERT (uni-modal encoders) or CLIP (a multi-modal\ndual-encoder), lack dense multi-modal interactions during pre-training, leading\nto a gap with a pixel-level RIS task. To bridge this gap, existing RIS methods\noften rely on multi-modal fusion modules that interact two encoders, but this\napproach leads to high computational costs. In this paper, we present a novel\nRIS method with a single-encoder, i.e., BEiT-3, maximizing the potential of\nshared self-attention across all framework components. This enables seamless\ninteractions of two modalities from input to final prediction, producing\ngranularly aligned multi-modal features. Furthermore, we propose lightweight\nyet effective decoder modules, a Shared FPN and a Shared Mask Decoder, which\ncontribute to the high efficiency of our model. Our simple baseline with a\nsingle encoder achieves outstanding performances on the RIS benchmark datasets\nwhile maintaining computational efficiency, compared to the most recent SoTA\nmethods based on dual-encoders.\n","authors":["Seonghoon Yu","Ilchae Jung","Byeongju Han","Taeoh Kim","Yunho Kim","Dongyoon Wee","Jeany Son"],"pdf_url":"https://arxiv.org/pdf/2408.15521v2.pdf","comment":"arXiv pre-print"}]},"2024-09-17T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2409.11598v1","updated":"2024-09-17T23:10:04Z","published":"2024-09-17T23:10:04Z","title":"Towards Fair RAG: On the Impact of Fair Ranking in Retrieval-Augmented\n Generation","summary":" Many language models now enhance their responses with retrieval capabilities,\nleading to the widespread adoption of retrieval-augmented generation (RAG)\nsystems. However, despite retrieval being a core component of RAG, much of the\nresearch in this area overlooks the extensive body of work on fair ranking,\nneglecting the importance of considering all stakeholders involved. This paper\npresents the first systematic evaluation of RAG systems integrated with fair\nrankings. We focus specifically on measuring the fair exposure of each relevant\nitem across the rankings utilized by RAG systems (i.e., item-side fairness),\naiming to promote equitable growth for relevant item providers. To gain a deep\nunderstanding of the relationship between item-fairness, ranking quality, and\ngeneration quality in the context of RAG, we analyze nine different RAG systems\nthat incorporate fair rankings across seven distinct datasets. Our findings\nindicate that RAG systems with fair rankings can maintain a high level of\ngeneration quality and, in many cases, even outperform traditional RAG systems,\ndespite the general trend of a tradeoff between ensuring fairness and\nmaintaining system-effectiveness. 
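The fair-ranking evaluation above measures item-side exposure across the rankings a RAG system consumes. A minimal sketch follows, assuming the standard logarithmic position discount from the expected-exposure literature rather than the paper's exact metric.

```python
# Hedged sketch: aggregate position-discounted exposure per item over the
# rankings used by a RAG system. The 1/log2(rank+1) discount is a common
# convention (DCG-style), not necessarily the paper's exact formulation.
import math
from collections import defaultdict

def exposure(rankings):
    """Sum discounted exposure for each item across many rankings."""
    totals = defaultdict(float)
    for ranking in rankings:
        for rank, item in enumerate(ranking, start=1):
            totals[item] += 1.0 / math.log2(rank + 1)
    return dict(totals)

# Three retrieval runs over the same relevant items: doc_a is usually
# ranked first, so it accumulates systematically more exposure than doc_c.
runs = [["doc_a", "doc_b", "doc_c"],
        ["doc_a", "doc_c", "doc_b"],
        ["doc_b", "doc_a", "doc_c"]]
for item, score in sorted(exposure(runs).items(), key=lambda kv: -kv[1]):
    print(f"{item}: {score:.3f}")
```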
We believe our insights lay the groundwork\nfor responsible and equitable RAG systems and open new avenues for future\nresearch. We publicly release our codebase and dataset at\nhttps://github.com/kimdanny/Fair-RAG.\n","authors":["To Eun Kim","Fernando Diaz"],"pdf_url":"https://arxiv.org/pdf/2409.11598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11589v1","updated":"2024-09-17T22:34:33Z","published":"2024-09-17T22:34:33Z","title":"ProSLM : A Prolog Synergized Language Model for explainable Domain\n Specific Knowledge Based Question Answering","summary":" Neurosymbolic approaches can add robustness to opaque neural systems by\nincorporating explainable symbolic representations. However, previous\napproaches have not used formal logic to contextualize queries to and validate\noutputs of large language models (LLMs). We propose \\systemname{}, a novel\nneurosymbolic framework, to improve the robustness and reliability of LLMs in\nquestion-answering tasks. We provide \\systemname{} with a domain-specific\nknowledge base, a logical reasoning system, and an integration with an existing\nLLM. This framework has two capabilities: (1) context gathering: generating\nexplainable and relevant context for a given query, and (2) validation:\nconfirming and validating the factual accuracy of a statement in accordance\nwith a knowledge base (KB). Our work opens a new area of neurosymbolic\ngenerative AI text validation and user personalization.\n","authors":["Priyesh Vakharia","Abigail Kufeldt","Max Meyers","Ian Lane","Leilani Gilpin"],"pdf_url":"https://arxiv.org/pdf/2409.11589v1.pdf","comment":"Accepted at NeSy 2024"},{"id":"http://arxiv.org/abs/2409.02257v2","updated":"2024-09-17T22:26:51Z","published":"2024-09-03T19:31:03Z","title":"MMLU-Pro+: Evaluating Higher-Order Reasoning and Shortcut Learning in\n LLMs","summary":" Existing benchmarks for large language models (LLMs) increasingly struggle to\ndifferentiate between top-performing models, underscoring the need for more\nchallenging evaluation frameworks. We introduce MMLU-Pro+, an enhanced\nbenchmark building upon MMLU-Pro to assess shortcut learning and higher-order\nreasoning in LLMs. By incorporating questions with multiple correct answers\nacross diverse domains, MMLU-Pro+ tests LLMs' ability to engage in complex\nreasoning and resist simplistic problem-solving strategies. Our results show\nthat MMLU-Pro+ maintains MMLU-Pro's difficulty while providing a more rigorous\ntest of model discrimination, particularly in multi-correct answer scenarios.\nWe introduce novel metrics like shortcut selection ratio and correct pair\nidentification ratio, offering deeper insights into model behavior and\nanchoring bias. Evaluations of six state-of-the-art LLMs reveal significant\nperformance gaps, highlighting variations in reasoning abilities and bias\nsusceptibility. We release the dataset and evaluation codes at\n\\url{https://github.com/asgsaeid/mmlu-pro-plus}.\n","authors":["Saeid Asgari Taghanaki","Aliasgahr Khani","Amir Khasahmadi"],"pdf_url":"https://arxiv.org/pdf/2409.02257v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.13940v2","updated":"2024-09-17T22:19:17Z","published":"2024-08-25T21:20:17Z","title":"CoT Rerailer: Enhancing the Reliability of Large Language Models in\n Complex Reasoning Tasks through Error Detection and Correction","summary":" Chain-of-Thought (CoT) prompting enhances the complex reasoning abilities of\nLarge Language Models (LLMs) by generating intermediate steps. 
However, these\nsteps can introduce hallucinations and accumulate errors. We propose the CoT\nRerailer to address these challenges, employing self-consistency and\nmulti-agent debate systems to identify and rectify errors in the reasoning\nprocess. The CoT Rerailer first selects the most logically correct Reasoning\nPath (RP) using consistency checks and critical evaluation by automated agents.\nIt then engages a multi-agent debate system to propose and validate corrections\nto ensure the generation of an error-free intermediate logical path. The\ncorrected steps are then used to generate a revised reasoning chain to further\nreduce hallucinations and enhance answer quality. We demonstrate the\neffectiveness of our approach across diverse question-answering datasets in\nvarious knowledge domains. The CoT Rerailer enhances the reliability of\nLLM-generated reasoning, contributing to more trustworthy AI driven\ndecision-making processes.\n","authors":["Guangya Wan","Yuqi Wu","Jie Chen","Sheng Li"],"pdf_url":"https://arxiv.org/pdf/2408.13940v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11579v1","updated":"2024-09-17T22:06:46Z","published":"2024-09-17T22:06:46Z","title":"HEARTS: A Holistic Framework for Explainable, Sustainable and Robust\n Text Stereotype Detection","summary":" Stereotypes are generalised assumptions about societal groups, and even\nstate-of-the-art LLMs using in-context learning struggle to identify them\naccurately. Due to the subjective nature of stereotypes, where what constitutes\na stereotype can vary widely depending on cultural, social, and individual\nperspectives, robust explainability is crucial. Explainable models ensure that\nthese nuanced judgments can be understood and validated by human users,\npromoting trust and accountability. We address these challenges by introducing\nHEARTS (Holistic Framework for Explainable, Sustainable, and Robust Text\nStereotype Detection), a framework that enhances model performance, minimises\ncarbon footprint, and provides transparent, interpretable explanations. We\nestablish the Expanded Multi-Grain Stereotype Dataset (EMGSD), comprising\n57,201 labeled texts across six groups, including under-represented\ndemographics like LGBTQ+ and regional stereotypes. Ablation studies confirm\nthat BERT models fine-tuned on EMGSD outperform those trained on individual\ncomponents. We then analyse a fine-tuned, carbon-efficient ALBERT-V2 model\nusing SHAP to generate token-level importance values, ensuring alignment with\nhuman understanding, and calculate explainability confidence scores by\ncomparing SHAP and LIME outputs. Finally, HEARTS is applied to assess\nstereotypical bias in 12 LLM outputs, revealing a gradual reduction in bias\nover time within model families.\n","authors":["Theo King","Zekun Wu","Adriano Koshiyama","Emre Kazim","Philip Treleaven"],"pdf_url":"https://arxiv.org/pdf/2409.11579v1.pdf","comment":"Submitted to NeurIPS 2024 SoLaR Workshop"},{"id":"http://arxiv.org/abs/2311.09336v4","updated":"2024-09-17T21:33:49Z","published":"2023-11-15T19:52:11Z","title":"Fine-grained LLM Agent: Pinpointing and Refining Large Language Models\n via Fine-Grained Actionable Feedback","summary":" Recent large language models (LLM) are leveraging human feedback to improve\ntheir generation quality. However, human feedback is costly to obtain,\nespecially during inference. In this work, we propose Fine-grained LLM agent,\nan inference time optimization method to refine LLM's output. 
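The consistency-check stage of the CoT Rerailer above builds on self-consistency. A minimal sketch follows, with sampled reasoning paths mocked out as dictionaries rather than drawn from an LLM; the debate and correction stages are omitted.

```python
# Minimal self-consistency sketch: sample several reasoning paths and
# keep the final answer the paths agree on most (majority vote).
from collections import Counter

def self_consistent_answer(paths):
    """Majority vote over the final answers of sampled reasoning paths."""
    answers = [path["answer"] for path in paths]
    winner, votes = Counter(answers).most_common(1)[0]
    return winner, votes / len(answers)

# Stand-ins for sampled chains of thought (normally drawn from an LLM).
sampled = [
    {"steps": ["17 * 3 = 51", "51 + 9 = 60"], "answer": 60},
    {"steps": ["17 * 3 = 51", "51 + 9 = 60"], "answer": 60},
    {"steps": ["17 * 3 = 41", "41 + 9 = 50"], "answer": 50},  # derailed path
]
answer, agreement = self_consistent_answer(sampled)
print(f"answer={answer}, agreement={agreement:.0%}")  # answer=60, 67%
```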
The core idea is\nto use a learned fine-grained feedback model to pinpoint defects and guide the LLM\nto refine them iteratively. Using the original LLM to propose edits,\nFine-grained LLM agent searches for defect-free text via simulated annealing,\ntrading off exploration and exploitation. We conduct experiments on three\ntext generation tasks, including machine translation, long-form question\nanswering (QA), and topical summarization. Fine-grained LLM agent consistently\noutperforms all baseline approaches, achieving improvements of up to 1.7 MetricX\npoints on translation tasks, 8.1 ROUGE-L on ASQA, and 2.2 ROUGE-L on topical\nsummarization.\n","authors":["Wenda Xu","Daniel Deutsch","Mara Finkelstein","Juraj Juraska","Biao Zhang","Zhongtao Liu","William Yang Wang","Lei Li","Markus Freitag"],"pdf_url":"https://arxiv.org/pdf/2311.09336v4.pdf","comment":"Accepted to NAACL 2024"},{"id":"http://arxiv.org/abs/2406.17967v2","updated":"2024-09-17T21:29:13Z","published":"2024-06-25T22:49:17Z","title":"Unmasking the Imposters: How Censorship and Domain Adaptation Affect the\n Detection of Machine-Generated Tweets","summary":" The rapid development of large language models (LLMs) has significantly\nimproved the generation of fluent and convincing text, raising concerns about\ntheir potential misuse on social media platforms. We present a comprehensive\nmethodology for creating nine Twitter datasets to examine the generative\ncapabilities of four prominent LLMs: Llama 3, Mistral, Qwen2, and GPT4o. These\ndatasets encompass four censored and five uncensored model configurations,\nincluding 7B and 8B parameter base-instruction models of the three open-source\nLLMs. Additionally, we perform a data quality analysis to assess the\ncharacteristics of textual outputs from human, \"censored,\" and \"uncensored\"\nmodels, employing semantic meaning, lexical richness, structural patterns,\ncontent characteristics, and detector performance metrics to identify\ndifferences and similarities. Our evaluation demonstrates that \"uncensored\"\nmodels significantly undermine the effectiveness of automated detection\nmethods. This study addresses a critical gap by exploring smaller open-source\nmodels and the ramifications of \"uncensoring,\" providing valuable insights into\nhow domain adaptation and content moderation strategies influence both the\ndetectability and structural characteristics of machine-generated text.\n","authors":["Bryan E. Tuck","Rakesh M. Verma"],"pdf_url":"https://arxiv.org/pdf/2406.17967v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11564v1","updated":"2024-09-17T21:28:51Z","published":"2024-09-17T21:28:51Z","title":"Preference Tuning with Human Feedback on Language, Speech, and Vision\n Tasks: A Survey","summary":" Preference tuning is a crucial process for aligning deep generative models\nwith human preferences. This survey offers a thorough overview of recent\nadvancements in preference tuning and the integration of human feedback. 
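The fine-grained LLM agent above searches for defect-free text via simulated annealing. A toy sketch of that search loop follows; the character-flip proposal and reference-based defect count are stand-ins for the LLM proposer and the learned fine-grained feedback model, not the paper's components.

```python
# Toy simulated-annealing refinement loop: propose an edit, score it,
# accept regressions with a temperature-controlled probability so the
# search can escape local optima (exploration vs. exploitation).
import math
import random

random.seed(0)

def propose_edit(text):
    """Toy proposal: change one character (stands in for an LLM edit)."""
    i = random.randrange(len(text))
    return text[:i] + random.choice("abcdefgh ") + text[i + 1:]

def defect_score(text, reference="the cat sat"):
    """Toy feedback model: character mismatches against a reference."""
    return sum(a != b for a, b in zip(text, reference))

def anneal(text, steps=2000, t0=2.0):
    score = defect_score(text)
    for step in range(steps):
        temp = t0 * (1 - step / steps) + 1e-6
        cand = propose_edit(text)
        cand_score = defect_score(cand)
        # Always accept improvements; accept worse candidates with
        # probability exp(-(delta)/temperature).
        if cand_score <= score or random.random() < math.exp((score - cand_score) / temp):
            text, score = cand, cand_score
    return text, score

print(anneal("xxx xxx xxx"))  # converges toward the defect-free reference
```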
The\npaper is organized into three main sections: 1) introduction and preliminaries:\nan introduction to reinforcement learning frameworks, preference tuning tasks,\nmodels, and datasets across various modalities: language, speech, and vision,\nas well as different policy approaches, 2) in-depth examination of each\npreference tuning approach: a detailed analysis of the methods used in\npreference tuning, and 3) applications, discussion, and future directions: an\nexploration of the applications of preference tuning in downstream tasks,\nincluding evaluation methods for different modalities, and an outlook on future\nresearch directions. Our objective is to present the latest methodologies in\npreference tuning and model alignment, enhancing the understanding of this\nfield for researchers and practitioners. We hope to encourage further\nengagement and innovation in this area.\n","authors":["Genta Indra Winata","Hanyang Zhao","Anirban Das","Wenpin Tang","David D. Yao","Shi-Xiong Zhang","Sambit Sahu"],"pdf_url":"https://arxiv.org/pdf/2409.11564v1.pdf","comment":"Survey paper"},{"id":"http://arxiv.org/abs/2406.10984v2","updated":"2024-09-17T20:47:14Z","published":"2024-06-16T15:44:37Z","title":"Revisiting Cosine Similarity via Normalized ICA-transformed Embeddings","summary":" Cosine similarity is widely used to measure the similarity between two\nembeddings, while interpretations based on angle and correlation coefficient\nare common. In this study, we focus on the interpretable axes of embeddings\ntransformed by Independent Component Analysis (ICA), and propose a novel\ninterpretation of cosine similarity as the sum of semantic similarities over\naxes. The normalized ICA-transformed embeddings exhibit sparsity, enhancing the\ninterpretability of each axis, and the semantic similarity defined by the\nproduct of the components represents the shared meaning between the two\nembeddings along each axis. The effectiveness of this approach is demonstrated\nthrough intuitive numerical examples and thorough numerical experiments. By\nderiving the probability distributions that govern each component and the\nproduct of components, we propose a method for selecting statistically\nsignificant axes.\n","authors":["Hiroaki Yamagiwa","Momose Oyama","Hidetoshi Shimodaira"],"pdf_url":"https://arxiv.org/pdf/2406.10984v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11547v1","updated":"2024-09-17T20:40:02Z","published":"2024-09-17T20:40:02Z","title":"Small Language Models can Outperform Humans in Short Creative Writing: A\n Study Comparing SLMs with Humans and LLMs","summary":" In this paper, we evaluate the creative fiction writing abilities of a\nfine-tuned small language model (SLM), BART Large, and compare its performance\nto humans and two large language models (LLMs): GPT-3.5 and GPT-4o. Our\nevaluation consists of two experiments: (i) a human evaluation where readers\nassess the stories generated by the SLM compared to human-written stories, and\n(ii) a qualitative linguistic analysis comparing the textual characteristics of\nthe stories generated by the different models. In the first experiment, we\nasked 68 participants to rate short stories generated by the models and humans\nalong dimensions such as grammaticality, relevance, creativity, and\nattractiveness. BART Large outperformed human writers in most aspects, except\ncreativity, with an overall score of 2.11 compared to 1.85 for human-written\ntexts -- a 14% improvement. 
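The ICA paper above interprets cosine similarity as a sum of per-axis semantic similarities. For L2-normalized vectors that decomposition is exact, as the short check below confirms; random vectors stand in for normalized ICA-transformed embeddings.

```python
# Numerical check: for unit vectors, cosine similarity equals the sum of
# per-axis products, so each axis's contribution can be inspected alone.
import numpy as np

rng = np.random.default_rng(0)
x, y = rng.standard_normal(8), rng.standard_normal(8)
x, y = x / np.linalg.norm(x), y / np.linalg.norm(y)

per_axis = x * y                            # similarity along each axis
print(np.allclose(per_axis.sum(), x @ y))   # True: cos(x, y) == sum of terms
print(np.argsort(per_axis)[::-1][:3])       # the three most contributing axes
```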
In the second experiment, the qualitative analysis\nrevealed that, while GPT-4o exhibited near-perfect internal and external\ncoherence, it tended to produce more predictable narratives, with only 3% of\nits stories seen as novel. In contrast, 15% of BART's stories were considered\nnovel, indicating a higher degree of creativity despite its smaller model size.\nThis study provides both quantitative and qualitative insights into how model\nsize and fine-tuning influence the balance between creativity, fluency, and\ncoherence in creative writing tasks.\n","authors":["Guillermo Marco","Luz Rello","Julio Gonzalo"],"pdf_url":"https://arxiv.org/pdf/2409.11547v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11538v1","updated":"2024-09-17T20:16:43Z","published":"2024-09-17T20:16:43Z","title":"Chain-of-Thought Prompting for Speech Translation","summary":" Large language models (LLMs) have demonstrated remarkable advancements in\nlanguage understanding and generation. Building on the success of text-based\nLLMs, recent research has adapted these models to use speech embeddings for\nprompting, resulting in Speech-LLM models that exhibit strong performance in\nautomatic speech recognition (ASR) and automatic speech translation (AST). In\nthis work, we propose a novel approach to leverage ASR transcripts as prompts\nfor AST in a Speech-LLM built on an encoder-decoder text LLM. The Speech-LLM\nmodel consists of a speech encoder and an encoder-decoder structure\nMegatron-T5. By first decoding speech to generate ASR transcripts and\nsubsequently using these transcripts along with encoded speech for prompting,\nwe guide the speech translation in a two-step process like chain-of-thought\n(CoT) prompting. Low-rank adaptation (LoRA) is used for the T5 LLM for model\nadaptation and shows superior performance to full model fine-tuning.\nExperimental results show that the proposed CoT prompting significantly\nimproves AST performance, achieving an average increase of 2.4 BLEU points\nacross 6 En->X or X->En AST tasks compared to speech prompting alone.\nAdditionally, compared to a related CoT prediction method that predicts a\nconcatenated sequence of ASR and AST transcripts, our method performs better by\nan average of 2 BLEU points.\n","authors":["Ke Hu","Zhehuai Chen","Chao-Han Huck Yang","Piotr Żelasko","Oleksii Hrinchuk","Vitaly Lavrukhin","Jagadeesh Balam","Boris Ginsburg"],"pdf_url":"https://arxiv.org/pdf/2409.11538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00236v4","updated":"2024-09-17T19:56:09Z","published":"2023-09-01T03:53:40Z","title":"Image Hijacks: Adversarial Images can Control Generative Models at\n Runtime","summary":" Are foundation models secure against malicious actors? In this work, we focus\non the image input to a vision-language model (VLM). We discover image hijacks,\nadversarial images that control the behaviour of VLMs at inference time, and\nintroduce the general Behaviour Matching algorithm for training image hijacks.\nFrom this, we derive the Prompt Matching method, allowing us to train hijacks\nmatching the behaviour of an arbitrary user-defined text prompt (e.g. 'the\nEiffel Tower is now located in Rome') using a generic, off-the-shelf dataset\nunrelated to our choice of prompt. We use Behaviour Matching to craft hijacks\nfor four types of attack, forcing VLMs to generate outputs of the adversary's\nchoice, leak information from their context window, override their safety\ntraining, and believe false statements. 
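The speech-translation paper above adapts its T5 LLM with LoRA. The sketch below shows the generic LoRA mechanism, freezing the pretrained weight and learning a low-rank update scaled by alpha/r; it is not the paper's exact Megatron-T5 integration.

```python
# Minimal LoRA linear layer: output = W x + (alpha/r) * B A x, with W
# frozen and only the low-rank factors A and B trainable.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, r: int = 8, alpha: int = 16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():   # pretrained weights stay frozen
            p.requires_grad = False
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))
        self.scale = alpha / r  # B starts at zero, so the adapted model
                                # initially matches the base model exactly

    def forward(self, x):
        return self.base(x) + (x @ self.A.T @ self.B.T) * self.scale

layer = LoRALinear(nn.Linear(512, 512))
out = layer(torch.randn(4, 512))
trainable = sum(p.numel() for p in layer.parameters() if p.requires_grad)
print(out.shape, f"trainable params: {trainable}")  # 8192 vs 262656 full
```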
We study these attacks against LLaVA, a\nstate-of-the-art VLM based on CLIP and LLaMA-2, and find that all attack types\nachieve a success rate of over 80%. Moreover, our attacks are automated and\nrequire only small image perturbations.\n","authors":["Luke Bailey","Euan Ong","Stuart Russell","Scott Emmons"],"pdf_url":"https://arxiv.org/pdf/2309.00236v4.pdf","comment":"Project page at https://image-hijacks.github.io"},{"id":"http://arxiv.org/abs/2409.11501v1","updated":"2024-09-17T19:05:37Z","published":"2024-09-17T19:05:37Z","title":"Egalitarian Language Representation in Language Models: It All Begins\n with Tokenizers","summary":" Tokenizers act as a bridge between human language and the latent space of\nlanguage models, influencing how language is represented in these models. Due\nto the immense popularity of English-Centric Large Language Models (LLMs),\nefforts are being made to adapt them for other languages. However, we\ndemonstrate that, from a tokenization standpoint, not all tokenizers offer fair\nrepresentation for complex script languages such as Tamil, Sinhala, and Hindi,\nprimarily due to the choice of pre-tokenization methods. We go further to show\nthat pre-tokenization plays a more critical role than the tokenization\nalgorithm itself in achieving an egalitarian representation of these complex\nscript languages. To address this, we introduce an improvement to the Byte Pair\nEncoding (BPE) algorithm by incorporating graphemes, which we term Grapheme\nPair Encoding (GPE). Our experiments show that grapheme-based character\nextraction outperforms byte-level tokenizers for complex scripts. We validate\nthis approach through experiments on Tamil, Sinhala, and Hindi.\n","authors":["Menan Velayuthan","Kengatharaiyer Sarveswaran"],"pdf_url":"https://arxiv.org/pdf/2409.11501v1.pdf","comment":"Content - 8 pages, References - 3 pages"},{"id":"http://arxiv.org/abs/2409.11500v1","updated":"2024-09-17T19:02:39Z","published":"2024-09-17T19:02:39Z","title":"Multi-Document Grounded Multi-Turn Synthetic Dialog Generation","summary":" We introduce a technique for multi-document grounded multi-turn synthetic\ndialog generation that incorporates three main ideas. First, we control the\noverall dialog flow using taxonomy-driven user queries that are generated with\nChain-of-Thought (CoT) prompting. Second, we support the generation of\nmulti-document grounded dialogs by mimicking real-world use of retrievers to\nupdate the grounding documents after every user-turn in the dialog. Third, we\napply LLM-as-a-Judge to filter out queries with incorrect answers. Human\nevaluation of the synthetic dialog data suggests that the data is diverse,\ncoherent, and includes mostly correct answers. Both human and automatic\nevaluations of answerable queries indicate that models fine-tuned on synthetic\ndialogs consistently out-perform those fine-tuned on existing human generated\ntraining data across four publicly available multi-turn document grounded\nbenchmark test sets.\n","authors":["Young-Suk Lee","Chulaka Gunasekara","Danish Contractor","Ramón Fernandez Astudillo","Radu Florian"],"pdf_url":"https://arxiv.org/pdf/2409.11500v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11498v1","updated":"2024-09-17T19:00:21Z","published":"2024-09-17T19:00:21Z","title":"Augment, Drop & Swap: Improving Diversity in LLM Captions for Efficient\n Music-Text Representation Learning","summary":" Audio-text contrastive models have become a powerful approach in music\nrepresentation learning. 
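The Grapheme Pair Encoding paper above argues that pre-tokenization should respect grapheme boundaries in complex scripts. The demo below, using the third-party regex package's \X extended-grapheme-cluster pattern, shows why: a short Tamil word is three user-perceived units but five code points and fifteen UTF-8 bytes, so byte-level BPE fragments it badly.

```python
# Grapheme-aware pre-tokenization demo for a complex script.
import regex  # pip install regex; supports \X (extended grapheme clusters)

word = "தமிழ்"  # "Tamil" in Tamil script
graphemes = regex.findall(r"\X", word)
print(graphemes)                    # ['த', 'மி', 'ழ்'] - 3 perceived units
print(len(word))                    # 5 code points
print(len(word.encode("utf-8")))    # 15 bytes - what byte-level BPE sees
```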
Despite their empirical success, however, little is\nknown about the influence of key design choices on the quality of music-text\nrepresentations learnt through this framework. In this work, we expose these\ndesign choices within the constraints of limited data and computation budgets,\nand establish a more solid understanding of their impact grounded in empirical\nobservations along three axes: the choice of base encoders, the level of\ncuration in training data, and the use of text augmentation. We find that data\ncuration is the single most important factor for music-text contrastive\ntraining in resource-constrained scenarios. Motivated by this insight, we\nintroduce two novel techniques, Augmented View Dropout and TextSwap, which\nincrease the diversity and descriptiveness of text inputs seen in training.\nThrough our experiments we demonstrate that these are effective at boosting\nperformance across different pre-training regimes, model architectures, and\ndownstream data distributions, without incurring higher computational costs or\nrequiring additional training data.\n","authors":["Ilaria Manco","Justin Salamon","Oriol Nieto"],"pdf_url":"https://arxiv.org/pdf/2409.11498v1.pdf","comment":"To appear in the Proceedings of the 25th International Society for\n Music Information Retrieval Conference (ISMIR 2024)"},{"id":"http://arxiv.org/abs/2406.15045v2","updated":"2024-09-17T18:57:49Z","published":"2024-06-21T10:48:21Z","title":"Integrating Knowledge Retrieval and Large Language Models for Clinical\n Report Correction","summary":" This study proposes an approach for error correction in radiology reports,\nleveraging large language models (LLMs) and retrieval-augmented generation\n(RAG) techniques. The proposed framework employs a novel internal+external\nretrieval mechanism to extract relevant medical entities and relations from the\nreport of interest and an external knowledge source. A three-stage inference\nprocess is introduced, decomposing the task into error detection, localization,\nand correction subtasks, which enhances the explainability and performance of\nthe system. The effectiveness of the approach is evaluated using a benchmark\ndataset created by corrupting real-world radiology reports with realistic\nerrors, guided by domain experts. Experimental results demonstrate the benefits\nof the proposed methods, with the combination of internal and external\nretrieval significantly improving the accuracy of error detection,\nlocalization, and correction across various state-of-the-art LLMs. The findings\ncontribute to the development of more robust and reliable error correction\nsystems for clinical documentation.\n","authors":["Jinge Wu","Zhaolong Wu","Ruizhe Li","Abul Hasan","Yunsoo Kim","Jason P. Y. Cheung","Teng Zhang","Honghan Wu"],"pdf_url":"https://arxiv.org/pdf/2406.15045v2.pdf","comment":"v2"},{"id":"http://arxiv.org/abs/2409.11491v1","updated":"2024-09-17T18:40:49Z","published":"2024-09-17T18:40:49Z","title":"Enriching Datasets with Demographics through Large Language Models:\n What's in a Name?","summary":" Enriching datasets with demographic information, such as gender, race, and\nage from names, is a critical task in fields like healthcare, public policy,\nand social sciences. Such demographic insights allow for more precise and\neffective engagement with target populations. 
Despite previous efforts\nemploying hidden Markov models and recurrent neural networks to predict\ndemographics from names, significant limitations persist: the lack of\nlarge-scale, well-curated, unbiased, publicly available datasets, and the lack\nof an approach robust across datasets. This scarcity has hindered the\ndevelopment of traditional supervised learning approaches. In this paper, we\ndemonstrate that the zero-shot capabilities of Large Language Models (LLMs) can\nperform as well as, if not better than, bespoke models trained on specialized\ndata. We apply these LLMs to a variety of datasets, including a real-life,\nunlabelled dataset of licensed financial professionals in Hong Kong, and\ncritically assess the inherent demographic biases in these models. Our work not\nonly advances the state-of-the-art in demographic enrichment but also opens\navenues for future research in mitigating biases in LLMs.\n","authors":["Khaled AlNuaimi","Gautier Marti","Mathieu Ravaut","Abdulla AlKetbi","Andreas Henschel","Raed Jaradat"],"pdf_url":"https://arxiv.org/pdf/2409.11491v1.pdf","comment":"8 pages, 7 Tables, 5 Figures"},{"id":"http://arxiv.org/abs/2409.11404v1","updated":"2024-09-17T17:59:25Z","published":"2024-09-17T17:59:25Z","title":"AraDiCE: Benchmarks for Dialectal and Cultural Capabilities in LLMs","summary":" Arabic, with its rich diversity of dialects, remains significantly\nunderrepresented in Large Language Models, particularly in dialectal\nvariations. We address this gap by introducing seven synthetic datasets in\ndialects alongside Modern Standard Arabic (MSA), created using Machine\nTranslation (MT) combined with human post-editing. We present AraDiCE, a\nbenchmark for Arabic Dialect and Cultural Evaluation. We evaluate LLMs on\ndialect comprehension and generation, focusing specifically on low-resource\nArabic dialects. Additionally, we introduce the first-ever fine-grained\nbenchmark designed to evaluate cultural awareness across the Gulf, Egypt, and\nLevant regions, providing a novel dimension to LLM evaluation. Our findings\ndemonstrate that while Arabic-specific models like Jais and AceGPT outperform\nmultilingual models on dialectal tasks, significant challenges persist in\ndialect identification, generation, and translation. This work contributes ~45K\npost-edited samples, a cultural benchmark, and highlights the importance of\ntailored training to improve LLM performance in capturing the nuances of\ndiverse Arabic dialects and cultural contexts. We will release the dialectal\ntranslation models and benchmarks curated in this study.\n","authors":["Basel Mousi","Nadir Durrani","Fatema Ahmad","Md. Arid Hasan","Maram Hasanain","Tameem Kabbani","Fahim Dalvi","Shammur Absar Chowdhury","Firoj Alam"],"pdf_url":"https://arxiv.org/pdf/2409.11404v1.pdf","comment":"Benchmarking, Culturally Informed, Large Language Models, Arabic NLP,\n LLMs"},{"id":"http://arxiv.org/abs/2409.11402v1","updated":"2024-09-17T17:59:06Z","published":"2024-09-17T17:59:06Z","title":"NVLM: Open Frontier-Class Multimodal LLMs","summary":" We introduce NVLM 1.0, a family of frontier-class multimodal large language\nmodels (LLMs) that achieve state-of-the-art results on vision-language tasks,\nrivaling the leading proprietary models (e.g., GPT-4o) and open-access models\n(e.g., Llama 3-V 405B and InternVL 2). Remarkably, NVLM 1.0 shows improved\ntext-only performance over its LLM backbone after multimodal training. 
In terms\nof model design, we perform a comprehensive comparison between decoder-only\nmultimodal LLMs (e.g., LLaVA) and cross-attention-based models (e.g.,\nFlamingo). Based on the strengths and weaknesses of both approaches, we propose\na novel architecture that enhances both training efficiency and multimodal\nreasoning capabilities. Furthermore, we introduce a 1-D tile-tagging design for\ntile-based dynamic high-resolution images, which significantly boosts\nperformance on multimodal reasoning and OCR-related tasks. Regarding training\ndata, we meticulously curate and provide detailed information on our multimodal\npretraining and supervised fine-tuning datasets. Our findings indicate that\ndataset quality and task diversity are more important than scale, even during\nthe pretraining phase, across all architectures. Notably, we develop\nproduction-grade multimodality for the NVLM-1.0 models, enabling them to excel\nin vision-language tasks while maintaining and even improving text-only\nperformance compared to their LLM backbones. To achieve this, we craft and\nintegrate a high-quality text-only dataset into multimodal training, alongside\na substantial amount of multimodal math and reasoning data, leading to enhanced\nmath and coding capabilities across modalities. To advance research in the\nfield, we are releasing the model weights and will open-source the code for the\ncommunity: https://nvlm-project.github.io/.\n","authors":["Wenliang Dai","Nayeon Lee","Boxin Wang","Zhuoling Yang","Zihan Liu","Jon Barker","Tuomas Rintamaki","Mohammad Shoeybi","Bryan Catanzaro","Wei Ping"],"pdf_url":"https://arxiv.org/pdf/2409.11402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11390v1","updated":"2024-09-17T17:50:15Z","published":"2024-09-17T17:50:15Z","title":"Says Who? Effective Zero-Shot Annotation of Focalization","summary":" Focalization, the perspective through which narrative is presented, is\nencoded via a wide range of lexico-grammatical features and is subject to\nreader interpretation. Moreover, trained readers regularly disagree on\ninterpretations, suggesting that this problem may be computationally\nintractable. In this paper, we provide experiments to test how well\ncontemporary Large Language Models (LLMs) perform when annotating literary\ntexts for focalization mode. Despite the challenging nature of the task, LLMs\nshow comparable performance to trained human annotators in our experiments. We\nprovide a case study working with the novels of Stephen King to demonstrate the\nusefulness of this approach for computational literary studies, illustrating\nhow focalization can be studied at scale.\n","authors":["Rebecca M. M. Hicke","Yuri Bizzoni","Pascale Feldkamp","Ross Deans Kristensen-McLachlan"],"pdf_url":"https://arxiv.org/pdf/2409.11390v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.06173v2","updated":"2024-09-17T17:42:26Z","published":"2024-09-10T03:06:17Z","title":"Larger Language Models Don't Care How You Think: Why Chain-of-Thought\n Prompting Fails in Subjective Tasks","summary":" In-Context Learning (ICL) in Large Language Models (LLM) has emerged as the\ndominant technique for performing natural language tasks, as it does not\nrequire updating the model parameters with gradient-based methods. ICL promises\nto \"adapt\" the LLM to perform the present task at a competitive or\nstate-of-the-art level at a fraction of the computational cost. 
ICL can be\naugmented by incorporating the reasoning process to arrive at the final label\nexplicitly in the prompt, a technique called Chain-of-Thought (CoT) prompting.\nHowever, recent work has found that ICL relies mostly on the retrieval of task\npriors and less so on \"learning\" to perform tasks, especially for complex\nsubjective domains like emotion and morality, where priors ossify posterior\npredictions. In this work, we examine whether \"enabling\" reasoning also creates\nthe same behavior in LLMs, wherein the format of CoT retrieves reasoning priors\nthat remain relatively unchanged despite the evidence in the prompt. We find\nthat, surprisingly, CoT indeed suffers from the same posterior collapse as ICL\nfor larger language models. Code is available at\nhttps://github.com/gchochla/cot-priors.\n","authors":["Georgios Chochlakis","Niyantha Maruthu Pandiyan","Kristina Lerman","Shrikanth Narayanan"],"pdf_url":"https://arxiv.org/pdf/2409.06173v2.pdf","comment":"5 pages, 2 figures, 1 table. arXiv admin note: text overlap with\n arXiv:2403.17125"},{"id":"http://arxiv.org/abs/2403.12958v2","updated":"2024-09-17T17:25:40Z","published":"2024-03-19T17:57:58Z","title":"Dated Data: Tracing Knowledge Cutoffs in Large Language Models","summary":" Released Large Language Models (LLMs) are often paired with a claimed\nknowledge cutoff date, or the dates at which training data was gathered. Such\ninformation is crucial for applications where the LLM must provide up-to-date\ninformation. However, this statement only scratches the surface: do all\nresources in the training data share the same knowledge cutoff date? Does the\nmodel's demonstrated knowledge for these subsets closely align with their cutoff\ndates? In this work, we define the notion of an effective cutoff. This is\ndistinct from the LLM designer's reported cutoff and applies separately to\nsub-resources and topics. We propose a simple approach to estimate effective\ncutoffs on the resource-level temporal alignment of an LLM by probing across\nversions of the data. Using this analysis, we find that effective cutoffs often\ndiffer from reported cutoffs. To understand the root cause of this observation,\nwe conduct a direct large-scale analysis on open pre-training datasets. Our\nanalysis reveals two reasons for these inconsistencies: (1) temporal biases of\nCommonCrawl data due to non-trivial amounts of old data in new dumps and (2)\ncomplications in LLM deduplication schemes involving semantic duplicates and\nlexical near-duplicates. Overall, our results show that knowledge cutoffs are\nnot as simple as they have seemed and that care must be taken both by LLM\ndataset curators and by practitioners who seek to use information from\nthese models.\n","authors":["Jeffrey Cheng","Marc Marone","Orion Weller","Dawn Lawrie","Daniel Khashabi","Benjamin Van Durme"],"pdf_url":"https://arxiv.org/pdf/2403.12958v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11378v1","updated":"2024-09-17T17:25:31Z","published":"2024-09-17T17:25:31Z","title":"Diversify and Conquer: Diversity-Centric Data Selection with Iterative\n Refinement","summary":" Finetuning large language models on instruction data is crucial for enhancing\npre-trained knowledge and improving instruction-following capabilities. As\ninstruction datasets proliferate, selecting optimal data for effective training\nbecomes increasingly important. This work addresses the question: How can we\ndetermine the optimal subset of data for effective training? 
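The diversity-centric selection paper above clusters instance embeddings with k-means so the chosen subset represents the full dataset. A minimal sketch of that idea follows; the paper's iterative reweighting and quality filtering are omitted, and the even per-cluster sampling is an assumption.

```python
# Diversity-first subset selection sketch: cluster embeddings, then
# sample evenly across clusters so the subset covers the whole dataset.
import numpy as np
from sklearn.cluster import KMeans

def diverse_subset(embeddings, budget, k=10, seed=0):
    km = KMeans(n_clusters=k, random_state=seed, n_init=10).fit(embeddings)
    rng = np.random.default_rng(seed)
    picked = []
    per_cluster = budget // k
    for c in range(k):
        members = np.flatnonzero(km.labels_ == c)
        take = min(per_cluster, members.size)
        picked.extend(rng.choice(members, size=take, replace=False))
    return np.array(picked)

data = np.random.default_rng(0).standard_normal((1000, 64))
subset = diverse_subset(data, budget=100)
print(subset.shape)  # ~(100,) indices spread across all 10 clusters
```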
While existing\nresearch often emphasizes local criteria like instance quality for subset\nselection, we argue that a global approach focused on data diversity is more\ncritical. Our method employs k-means clustering to ensure the selected subset\neffectively represents the full dataset. We propose an iterative refinement\nmethod inspired by active learning techniques to resample instances from\nclusters, reassessing each cluster's importance and sampling weight in every\ntraining iteration. This approach reduces the effect of outliers and\nautomatically filters out clusters containing low-quality data. Through\nextensive evaluation across natural language reasoning, general world\nknowledge, code and math reasoning tasks, and by fine-tuning models from\nvarious families, we observe consistent improvements, achieving a 7% increase\nover random selection and a 3.8% improvement over state-of-the-art sampling\nmethods. Our work highlights the significance of diversity-first sampling when\nfinetuning LLMs to enhance performance across a broad array of evaluation\ntasks. Our code is available at\nhttps://github.com/for-ai/iterative-data-selection.\n","authors":["Simon Yu","Liangyu Chen","Sara Ahmadian","Marzieh Fadaee"],"pdf_url":"https://arxiv.org/pdf/2409.11378v1.pdf","comment":"21 pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.11365v1","updated":"2024-09-17T17:14:41Z","published":"2024-09-17T17:14:41Z","title":"CoCA: Regaining Safety-awareness of Multimodal Large Language Models\n with Constitutional Calibration","summary":" The deployment of multimodal large language models (MLLMs) has demonstrated\nremarkable success in engaging in conversations involving visual inputs, thanks\nto the superior power of large language models (LLMs). These MLLMs are\ntypically built on top of LLMs, with an image encoder to process images into\nthe token embedding space of the LLMs. However, the integration of the visual\nmodality has introduced a unique vulnerability: the MLLM becomes susceptible to\nmalicious visual inputs and prone to generating sensitive or harmful responses,\neven though the LLM has been trained on textual data to align with human\nvalues. In this paper, we first raise the question: ``Do the MLLMs possess\nsafety-awareness against malicious image inputs?\". We find that after adding a\nprinciple that specifies the safety requirement into the input of the MLLM, the\nmodel's safety awareness is boosted. This phenomenon verifies the\nexistence of MLLM's safety-awareness against image inputs; it is merely weakened\nby the modality gap. We then introduce a simple yet effective technique termed\nCoCA, which amplifies the safety-awareness of the MLLM by calibrating its\noutput distribution. Our proposed strategy helps the model reclaim its original\nsafety awareness without losing its original capabilities. We verify the\neffectiveness of our approach on both multimodal safety and understanding\nbenchmarks.\n","authors":["Jiahui Gao","Renjie Pi","Tianyang Han","Han Wu","Lanqing Hong","Lingpeng Kong","Xin Jiang","Zhenguo Li"],"pdf_url":"https://arxiv.org/pdf/2409.11365v1.pdf","comment":"10 pages, COLM-2024"},{"id":"http://arxiv.org/abs/2409.11363v1","updated":"2024-09-17T17:13:19Z","published":"2024-09-17T17:13:19Z","title":"CORE-Bench: Fostering the Credibility of Published Research Through a\n Computational Reproducibility Agent Benchmark","summary":" AI agents have the potential to aid users on a variety of consequential\ntasks, including conducting scientific research. 
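CoCA above "amplifies the safety-awareness of the MLLM by calibrating its output distribution". The abstract does not give the exact rule, so the sketch below assumes a contrastive-decoding-style calibration that scales the logit shift induced by prepending a safety principle; treat it as an illustration, not the paper's method.

```python
# Hedged sketch of output-distribution calibration: amplify the effect a
# prepended safety principle has on next-token logits (assumed rule,
# modeled on contrastive decoding).
import numpy as np

def calibrated_logits(logits_plain, logits_with_principle, gamma=2.0):
    """Scale up the safety principle's shift of the next-token logits."""
    delta = logits_with_principle - logits_plain
    return logits_plain + gamma * delta

def softmax(z):
    z = z - z.max()
    e = np.exp(z)
    return e / e.sum()

# Toy vocabulary: index 0 is a harmful continuation, 1-4 are safe ones.
plain = np.array([3.0, 2.5, 2.0, 1.5, 1.0])
with_principle = np.array([2.2, 2.6, 2.1, 1.6, 1.1])  # principle demotes idx 0
for name, z in [("plain", plain),
                ("calibrated", calibrated_logits(plain, with_principle))]:
    print(name, np.round(softmax(z), 3))
```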
To spur the development of\nuseful agents, we need benchmarks that are challenging, but more crucially,\ndirectly correspond to real-world tasks of interest. This paper introduces such\na benchmark, designed to measure the accuracy of AI agents in tackling a\ncrucial yet surprisingly challenging aspect of scientific research:\ncomputational reproducibility. This task, fundamental to the scientific\nprocess, involves reproducing the results of a study using the provided code\nand data. We introduce CORE-Bench (Computational Reproducibility Agent\nBenchmark), a benchmark consisting of 270 tasks based on 90 scientific papers\nacross three disciplines (computer science, social science, and medicine).\nTasks in CORE-Bench consist of three difficulty levels and include both\nlanguage-only and vision-language tasks. We provide an evaluation system to\nmeasure the accuracy of agents in a fast and parallelizable way, saving days of\nevaluation time for each run compared to a sequential implementation. We\nevaluated two baseline agents: the general-purpose AutoGPT and a task-specific\nagent called CORE-Agent. We tested both variants using two underlying language\nmodels: GPT-4o and GPT-4o-mini. The best agent achieved an accuracy of 21% on\nthe hardest task, showing the vast scope for improvement in automating routine\nscientific tasks. Having agents that can reproduce existing work is a necessary\nstep towards building agents that can conduct novel research and could verify\nand improve the performance of other research agents. We hope that CORE-Bench\ncan improve the state of reproducibility and spur the development of future\nresearch agents.\n","authors":["Zachary S. Siegel","Sayash Kapoor","Nitya Nagdir","Benedikt Stroebl","Arvind Narayanan"],"pdf_url":"https://arxiv.org/pdf/2409.11363v1.pdf","comment":"Benchmark harness and code available at\n http://github.com/siegelz/core-bench"},{"id":"http://arxiv.org/abs/2409.11353v1","updated":"2024-09-17T16:55:25Z","published":"2024-09-17T16:55:25Z","title":"THaMES: An End-to-End Tool for Hallucination Mitigation and Evaluation\n in Large Language Models","summary":" Hallucination, the generation of factually incorrect content, is a growing\nchallenge in Large Language Models (LLMs). Existing detection and mitigation\nmethods are often isolated and insufficient for domain-specific needs, lacking\na standardized pipeline. This paper introduces THaMES (Tool for Hallucination\nMitigations and EvaluationS), an integrated framework and library addressing\nthis gap. THaMES offers an end-to-end solution for evaluating and mitigating\nhallucinations in LLMs, featuring automated test set generation, multifaceted\nbenchmarking, and adaptable mitigation strategies. It automates test set\ncreation from any corpus, ensuring high data quality, diversity, and\ncost-efficiency through techniques like batch processing, weighted sampling,\nand counterfactual validation. THaMES assesses a model's ability to detect and\nreduce hallucinations across various tasks, including text generation and\nbinary classification, applying optimal mitigation strategies like In-Context\nLearning (ICL), Retrieval Augmented Generation (RAG), and Parameter-Efficient\nFine-tuning (PEFT). Evaluations of state-of-the-art LLMs using a knowledge base\nof academic papers, political news, and Wikipedia reveal that commercial models\nlike GPT-4o benefit more from RAG than ICL, while open-weight models like\nLlama-3.1-8B-Instruct and Mistral-Nemo gain more from ICL. 
Additionally, PEFT\nsignificantly enhances the performance of Llama-3.1-8B-Instruct in both\nevaluation tasks.\n","authors":["Mengfei Liang","Archish Arun","Zekun Wu","Cristian Munoz","Jonathan Lutch","Emre Kazim","Adriano Koshiyama","Philip Treleaven"],"pdf_url":"https://arxiv.org/pdf/2409.11353v1.pdf","comment":"Submitted to NeurIPS 2024 SoLaR (Socially Responsible Language\n Modelling Research) Workshop"},{"id":"http://arxiv.org/abs/2406.11423v2","updated":"2024-09-17T16:20:53Z","published":"2024-06-17T11:22:04Z","title":"Bridging Social Media and Search Engines: Dredge Words and the Detection\n of Unreliable Domains","summary":" Proactive content moderation requires platforms to rapidly and continuously\nevaluate the credibility of websites. Leveraging the direct and indirect paths\nusers follow to unreliable websites, we develop a website credibility\nclassification and discovery system that integrates both webgraph and\nlarge-scale social media contexts. We additionally introduce the concept of\ndredge words, terms or phrases for which unreliable domains rank highly on\nsearch engines, and provide the first exploration of their usage on social\nmedia. Our graph neural networks, which combine webgraph and social media\ncontexts, achieve state-of-the-art results in website credibility\nclassification and significantly improve the top-k identification of\nunreliable domains. Additionally, we release a novel dataset of dredge words,\nhighlighting their strong connections to both social media and online commerce\nplatforms.\n","authors":["Evan M. Williams","Peter Carragher","Kathleen M. Carley"],"pdf_url":"https://arxiv.org/pdf/2406.11423v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11308v1","updated":"2024-09-17T16:05:09Z","published":"2024-09-17T16:05:09Z","title":"SpMis: An Investigation of Synthetic Spoken Misinformation Detection","summary":" In recent years, speech generation technology has advanced rapidly, fueled by\ngenerative models and large-scale training techniques. While these developments\nhave enabled the production of high-quality synthetic speech, they have also\nraised concerns about the misuse of this technology, particularly for\ngenerating synthetic misinformation. Current research primarily focuses on\ndistinguishing machine-generated speech from human-produced speech, but the\nmore urgent challenge is detecting misinformation within spoken content. This\ntask requires a thorough analysis of factors such as speaker identity, topic,\nand synthesis. To address this need, we conduct an initial investigation into\nsynthetic spoken misinformation detection by introducing an open-source\ndataset, SpMis. SpMis includes speech synthesized from over 1,000 speakers\nacross five common topics, utilizing state-of-the-art text-to-speech systems.\nAlthough our results show promising detection capabilities, they also reveal\nsubstantial challenges for practical implementation, underscoring the\nimportance of ongoing research in this critical area.\n","authors":["Peizhuo Liu","Li Wang","Renqiang He","Haorui He","Lei Wang","Huadi Zheng","Jie Shi","Tong Xiao","Zhizheng Wu"],"pdf_url":"https://arxiv.org/pdf/2409.11308v1.pdf","comment":"Accepted in SLT 2024"},{"id":"http://arxiv.org/abs/2409.11295v1","updated":"2024-09-17T15:49:44Z","published":"2024-09-17T15:49:44Z","title":"EIA: Environmental Injection Attack on Generalist Web Agents for Privacy\n Leakage","summary":" Generalist web agents have evolved rapidly and demonstrated remarkable\npotential. 
However, there are unprecedented safety risks associated with\nthem, which are nearly unexplored so far. In this work, we aim to narrow this\ngap by conducting the first study on the privacy risks of generalist web agents\nin adversarial environments. First, we present a threat model that discusses\nthe adversarial targets, constraints, and attack scenarios. Particularly, we\nconsider two types of adversarial targets: stealing users' specific personally\nidentifiable information (PII) or stealing the entire user request. To achieve\nthese objectives, we propose a novel attack method, termed Environmental\nInjection Attack (EIA). This attack injects malicious content designed to adapt\nwell to different environments where the agents operate, causing them to\nperform unintended actions. This work instantiates EIA specifically for the\nprivacy scenario. It inserts malicious web elements alongside persuasive\ninstructions that mislead web agents into leaking private information, and can\nfurther leverage CSS and JavaScript features to remain stealthy. We collect 177\naction steps that involve diverse PII categories on realistic websites from\nthe Mind2Web dataset, and conduct extensive experiments using one of the most\ncapable generalist web agent frameworks to date, SeeAct. The results\ndemonstrate that EIA achieves up to 70% ASR in stealing users' specific PII.\nStealing full user requests is more challenging, but a relaxed version of EIA\ncan still achieve 16% ASR. Despite these concerning results, it is important to\nnote that the attack can still be detected through careful human inspection,\nhighlighting a trade-off between high autonomy and security. This leads to our\ndetailed discussion on the efficacy of EIA under different levels of human\nsupervision as well as implications for defenses for generalist web agents.\n","authors":["Zeyi Liao","Lingbo Mo","Chejian Xu","Mintong Kang","Jiawei Zhang","Chaowei Xiao","Yuan Tian","Bo Li","Huan Sun"],"pdf_url":"https://arxiv.org/pdf/2409.11295v1.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2409.11282v1","updated":"2024-09-17T15:37:56Z","published":"2024-09-17T15:37:56Z","title":"Leveraging Distillation Techniques for Document Understanding: A Case\n Study with FLAN-T5","summary":" The surge of digital documents in various formats, including less\nstandardized documents such as business reports and environmental assessments,\nunderscores the growing importance of Document Understanding. While Large\nLanguage Models (LLMs) have showcased prowess across diverse natural language\nprocessing tasks, their direct application to Document Understanding remains a\nchallenge. Previous research has demonstrated the utility of LLMs in this\ndomain, yet their significant computational demands make them challenging to\ndeploy effectively. Additionally, proprietary Blackbox LLMs often outperform\ntheir open-source counterparts, posing a barrier to widespread accessibility.\nIn this paper, we delve into the realm of document understanding, leveraging\ndistillation methods to harness the power of large LLMs while accommodating\ncomputational limitations. Specifically, we present a novel approach wherein we\ndistill document understanding knowledge from the proprietary LLM ChatGPT into\nFLAN-T5. Our methodology integrates labeling and curriculum-learning mechanisms\nto facilitate efficient knowledge transfer. 
This work contributes to the\nadvancement of document understanding methodologies by offering a scalable\nsolution that bridges the gap between resource-intensive LLMs and practical\napplications. Our findings underscore the potential of distillation techniques\nin facilitating the deployment of sophisticated language models in real-world\nscenarios, thereby fostering advancements in natural language processing and\ndocument comprehension domains.\n","authors":["Marcel Lamott","Muhammad Armaghan Shakir"],"pdf_url":"https://arxiv.org/pdf/2409.11282v1.pdf","comment":"Presented at AI@WORK-Workshop / Informatik-Festival (GI-Jahrestagung)\n (Wiesbaden, Germany, 2024)"},{"id":"http://arxiv.org/abs/2409.11279v1","updated":"2024-09-17T15:29:34Z","published":"2024-09-17T15:29:34Z","title":"P-RAG: Progressive Retrieval Augmented Generation For Planning on\n Embodied Everyday Task","summary":" Embodied Everyday Task is a popular task in the embodied AI community,\nrequiring agents to make a sequence of actions based on natural language\ninstructions and visual observations. Traditional learning-based approaches\nface two challenges. Firstly, natural language instructions often lack explicit\ntask planning. Secondly, extensive training is required to equip models with\nknowledge of the task environment. Previous works based on Large Language Models\n(LLMs) either suffer from poor performance due to the lack of task-specific\nknowledge or rely on ground truth as few-shot samples. To address the above\nlimitations, we propose a novel approach called Progressive Retrieval Augmented\nGeneration (P-RAG), which not only effectively leverages the powerful language\nprocessing capabilities of LLMs but also progressively accumulates\ntask-specific knowledge without ground truth. Compared to the conventional RAG\nmethods, which retrieve relevant information from the database in a one-shot\nmanner to assist generation, P-RAG introduces an iterative approach to\nprogressively update the database. In each iteration, P-RAG retrieves from the\nlatest database and obtains historical information from the previous\ninteraction as experiential references for the current interaction. Moreover,\nwe introduce a more granular retrieval scheme that not only retrieves\nsimilar tasks but also incorporates retrieval of similar situations to provide\nmore valuable reference experiences. Extensive experiments reveal that P-RAG\nachieves competitive results without utilizing ground truth and can even\nfurther improve performance through self-iterations.\n","authors":["Weiye Xu","Min Wang","Wengang Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2409.11279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11274v1","updated":"2024-09-17T15:25:11Z","published":"2024-09-17T15:25:11Z","title":"Task Arithmetic for Language Expansion in Speech Translation","summary":" Recent advances in large language models (LLMs) have spurred interest in\nspeech-text multimodal foundation models, achieving strong performance on\ninstruction-based speech translation (ST). However, expanding language pairs\nfrom an existing instruction-tuned ST system is costly due to the necessity of\nre-training on a combination of new and previous datasets. We propose to expand\nnew language pairs by merging the model trained on new language pairs and the\nexisting model, using task arithmetic. 
We find that the direct application of\ntask arithmetic for ST causes the merged model to fail to follow instructions,\nthus generating translations in incorrect languages. To eliminate language\nconfusion, we propose an augmented task arithmetic method that merges an\nadditional language control model. It is trained to generate the correct target\nlanguage token following the instructions. Our experiments demonstrate that our\nproposed language control model can achieve language expansion by eliminating\nlanguage confusion. In our MuST-C and CoVoST-2 experiments, it shows up to 4.66\nand 4.92 BLEU score improvements, respectively. In addition, we demonstrate that our\ntask arithmetic framework can extend to a language pair where\nneither paired ST training data nor a pre-trained ST model is available. We\nfirst synthesize the ST system from machine translation (MT) systems via task\nanalogy, then merge the synthesized ST system into the existing ST model.\n","authors":["Yao-Fei Cheng","Hayato Futami","Yosuke Kashiwagi","Emiru Tsunoo","Wen Shen Teo","Siddhant Arora","Shinji Watanabe"],"pdf_url":"https://arxiv.org/pdf/2409.11274v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.04847v2","updated":"2024-09-17T15:17:36Z","published":"2024-06-07T11:21:52Z","title":"Do Language Models Exhibit Human-like Structural Priming Effects?","summary":" We explore which linguistic factors -- at the sentence and token level --\nplay an important role in influencing language model predictions, and\ninvestigate whether these are reflective of results found in humans and human\ncorpora (Gries and Kootstra, 2017). We make use of the structural priming\nparadigm, where recent exposure to a structure facilitates processing of the\nsame structure. We investigate not only whether, but also where priming\neffects occur, and what factors predict them. We show that these effects can be\nexplained via the inverse frequency effect, known in human priming, where rarer\nelements within a prime increase priming effects, as well as lexical dependence\nbetween prime and target. Our results provide an important piece in the puzzle\nof understanding how properties within their context affect structural\nprediction in language models.\n","authors":["Jaap Jumelet","Willem Zuidema","Arabella Sinclair"],"pdf_url":"https://arxiv.org/pdf/2406.04847v2.pdf","comment":"ACL Findings 2024"},{"id":"http://arxiv.org/abs/2409.11263v1","updated":"2024-09-17T15:11:39Z","published":"2024-09-17T15:11:39Z","title":"Bio-Inspired Mamba: Temporal Locality and Bioplausible Learning in\n Selective State Space Models","summary":" This paper introduces Bio-Inspired Mamba (BIM), a novel online learning\nframework for selective state space models that integrates biological learning\nprinciples with the Mamba architecture. BIM combines Real-Time Recurrent\nLearning (RTRL) with Spike-Timing-Dependent Plasticity (STDP)-like local\nlearning rules, addressing the challenges of temporal locality and biological\nplausibility in training spiking neural networks. Our approach leverages the\ninherent connection between backpropagation through time and STDP, offering a\ncomputationally efficient alternative that maintains the ability to capture\nlong-range dependencies. We evaluate BIM on language modeling, speech\nrecognition, and biomedical signal analysis tasks, demonstrating competitive\nperformance against traditional methods while adhering to biological learning\nprinciples. 
Results show improved energy efficiency and potential for\nneuromorphic hardware implementation. BIM not only advances the field of\nbiologically plausible machine learning but also provides insights into the\nmechanisms of temporal information processing in biological neural networks.\n","authors":["Jiahao Qin"],"pdf_url":"https://arxiv.org/pdf/2409.11263v1.pdf","comment":"17 pages, 1 figure, 2 tables"},{"id":"http://arxiv.org/abs/2409.11253v1","updated":"2024-09-17T15:02:23Z","published":"2024-09-17T15:02:23Z","title":"Norm of Mean Contextualized Embeddings Determines their Variance","summary":" Contextualized embeddings vary by context, even for the same token, and form\na distribution in the embedding space. To analyze this distribution, we focus\non the norm of the mean embedding and the variance of the embeddings. In this\nstudy, we first demonstrate that these values follow the well-known formula for\nvariance in statistics and provide an efficient sequential computation method.\nThen, by observing embeddings from intermediate layers of several Transformer\nmodels, we found a strong trade-off relationship between the norm and the\nvariance: as the mean embedding becomes closer to the origin, the variance\nincreases. This trade-off is likely influenced by the layer normalization\nmechanism used in Transformer models. Furthermore, when the sets of token\nembeddings are treated as clusters, we show that the variance of the entire\nembedding set can theoretically be decomposed into the within-cluster variance\nand the between-cluster variance. We found experimentally that as the layers of\nTransformer models deepen, the embeddings move farther from the origin, the\nbetween-cluster variance relatively decreases, and the within-cluster variance\nrelatively increases. These results are consistent with existing studies on the\nanisotropy of the embedding spaces across layers.\n","authors":["Hiroaki Yamagiwa","Hidetoshi Shimodaira"],"pdf_url":"https://arxiv.org/pdf/2409.11253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11252v1","updated":"2024-09-17T15:00:31Z","published":"2024-09-17T15:00:31Z","title":"WER We Stand: Benchmarking Urdu ASR Models","summary":" This paper presents a comprehensive evaluation of Urdu Automatic Speech\nRecognition (ASR) models. We analyze the performance of three ASR model\nfamilies: Whisper, MMS, and Seamless-M4T using Word Error Rate (WER), along\nwith a detailed examination of the most frequent wrong words and error types\nincluding insertions, deletions, and substitutions. Our analysis is conducted\nusing two types of datasets, read speech and conversational speech. Notably, we\npresent the first conversational speech dataset designed for benchmarking Urdu\nASR models. We find that seamless-large outperforms other ASR models on the\nread speech dataset, while whisper-large performs best on the conversational\nspeech dataset. Furthermore, this evaluation highlights the complexities of\nassessing ASR models for low-resource languages like Urdu using quantitative\nmetrics alone and emphasizes the need for a robust Urdu text normalization\nsystem. 
Our findings contribute valuable insights for developing robust ASR\nsystems for low-resource languages like Urdu.\n","authors":["Samee Arif","Aamina Jamal Khan","Mustafa Abbas","Agha Ali Raza","Awais Athar"],"pdf_url":"https://arxiv.org/pdf/2409.11252v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11250v1","updated":"2024-09-17T14:57:51Z","published":"2024-09-17T14:57:51Z","title":"Linear Recency Bias During Training Improves Transformers' Fit to\n Reading Times","summary":" Recent psycholinguistic research has compared human reading times to\nsurprisal estimates from language models to study the factors shaping human\nsentence processing difficulty. Previous studies have shown a strong fit\nbetween surprisal values from Transformers and reading times. However, standard\nTransformers work with a lossless representation of the entire previous\nlinguistic context, unlike models of human language processing that include\nmemory decay. To bridge this gap, this paper evaluates a modification of the\nTransformer model that uses ALiBi (Press et al., 2022), a recency bias added to\nattention scores. Surprisal estimates with ALiBi show an improved fit to human\nreading times compared to a standard Transformer baseline. A subsequent\nanalysis of attention heads suggests that ALiBi's mixture of slopes -- which\ndetermine the rate of memory decay in each attention head -- may play a role in\nthe improvement by helping models with ALiBi to track different kinds of\nlinguistic dependencies.\n","authors":["Christian Clark","Byung-Doh Oh","William Schuler"],"pdf_url":"https://arxiv.org/pdf/2409.11250v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11242v1","updated":"2024-09-17T14:47:33Z","published":"2024-09-17T14:47:33Z","title":"Measuring and Enhancing Trustworthiness of LLMs in RAG through Grounded\n Attributions and Learning to Refuse","summary":" LLMs are an integral part of retrieval-augmented generation (RAG) systems.\nWhile many studies focus on evaluating the quality of end-to-end RAG systems,\nthere is a lack of research on understanding the appropriateness of an LLM for\nthe RAG task. Thus, we introduce a new metric, Trust-Score, that provides a\nholistic evaluation of the trustworthiness of LLMs in an RAG framework. We show\nthat various prompting methods, such as in-context learning, fail to adapt LLMs\neffectively to the RAG task. Thus, we propose Trust-Align, a framework to align\nLLMs for higher Trust-Score. LLaMA-3-8b, aligned with our method, significantly\noutperforms open-source LLMs of comparable sizes on ASQA (up 10.7), QAMPARI (up\n29.2) and ELI5 (up 14.9). We release our code at:\nhttps://github.com/declare-lab/trust-align.\n","authors":["Maojia Song","Shang Hong Sim","Rishabh Bhardwaj","Hai Leong Chieu","Navonil Majumder","Soujanya Poria"],"pdf_url":"https://arxiv.org/pdf/2409.11242v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.09662v2","updated":"2024-09-17T14:44:34Z","published":"2024-09-15T08:25:24Z","title":"ExploreSelf: Fostering User-driven Exploration and Reflection on\n Personal Challenges with Adaptive Guidance by Large Language Models","summary":" Expressing stressful experiences in words is proven to improve mental and\nphysical health, but individuals often disengage with writing interventions as\nthey struggle to organize their thoughts and emotions. Reflective prompts have\nbeen used to provide direction, and large language models (LLMs) have\ndemonstrated the potential to provide tailored guidance. 
Current systems often\nlimit users' flexibility to direct their reflections. We thus present\nExploreSelf, an LLM-driven application designed to empower users to control\ntheir reflective journey. ExploreSelf allows users to receive adaptive support\nthrough dynamically generated questions. Through an exploratory study with 19\nparticipants, we examine how participants explore and reflect on personal\nchallenges using ExploreSelf. Our findings demonstrate that participants valued\nthe balance between guided support and freedom to control their reflective\njourney, leading to deeper engagement and insight. Building on our findings, we\ndiscuss implications for designing LLM-driven tools that promote user\nempowerment through effective reflective practices.\n","authors":["Inhwa Song","SoHyun Park","Sachin R. Pendse","Jessica Lee Schleider","Munmun De Choudhury","Young-Ho Kim"],"pdf_url":"https://arxiv.org/pdf/2409.09662v2.pdf","comment":"17 pages excluding reference and appendix"},{"id":"http://arxiv.org/abs/2409.11241v1","updated":"2024-09-17T14:43:14Z","published":"2024-09-17T14:43:14Z","title":"Spontaneous Informal Speech Dataset for Punctuation Restoration","summary":" Presently, punctuation restoration models are evaluated almost solely on\nwell-structured, scripted corpora. On the other hand, real-world ASR systems\nand post-processing pipelines are typically applied to spontaneous speech with\nsignificant irregularities, stutters, and deviations from perfect grammar. To\naddress this discrepancy, we introduce SponSpeech, a punctuation restoration\ndataset derived from informal speech sources, which includes punctuation and\ncasing information. In addition to publicly releasing the dataset, we\ncontribute a filtering pipeline that can be used to generate more data. Our\nfiltering pipeline examines the quality of both speech audio and transcription\ntext. We also carefully construct a ``challenging\" test set, aimed at\nevaluating models' ability to leverage audio information to predict otherwise\ngrammatically ambiguous punctuation. SponSpeech is available at\nhttps://github.com/GitHubAccountAnonymous/PR, along with all code for dataset\nbuilding and model runs.\n","authors":["Xing Yi Liu","Homayoon Beigi"],"pdf_url":"https://arxiv.org/pdf/2409.11241v1.pdf","comment":"8 pages, 7 tables, 1 figure, Recognition Technologies, Inc. Technical\n Report"},{"id":"http://arxiv.org/abs/2409.11239v1","updated":"2024-09-17T14:40:02Z","published":"2024-09-17T14:40:02Z","title":"LLM-as-a-Judge & Reward Model: What They Can and Cannot Do","summary":" LLM-as-a-Judge and reward models are widely used alternatives to\nmultiple-choice questions or human annotators for large language model (LLM)\nevaluation. Their efficacy shines in evaluating long-form responses, serving a\ncritical role as evaluators of leaderboards and as proxies to align LLMs via\nreinforcement learning. However, despite their popularity, their effectiveness\noutside of English remains largely unexplored. In this paper, we conduct a\ncomprehensive analysis on automated evaluators, reporting key findings on their\nbehavior in a non-English environment. First, we discover that English\nevaluation capabilities significantly influence language-specific capabilities,\noften more than the language proficiency itself, enabling evaluators trained in\nEnglish to easily transfer their skills to other languages. 
Second, we identify\ncritical shortcomings, where LLMs fail to detect and penalize errors, such as\nfactual inaccuracies, cultural misrepresentations, and the presence of unwanted\nlanguage. Finally, we release Kudge, the first non-English meta-evaluation\ndataset containing 5,012 human annotations in Korean.\n","authors":["Guijin Son","Hyunwoo Ko","Hoyoung Lee","Yewon Kim","Seunghyeok Hong"],"pdf_url":"https://arxiv.org/pdf/2409.11239v1.pdf","comment":"preprint"},{"id":"http://arxiv.org/abs/2409.11233v1","updated":"2024-09-17T14:34:11Z","published":"2024-09-17T14:34:11Z","title":"Evaluating the Impact of Compression Techniques on Task-Specific\n Performance of Large Language Models","summary":" Large language models (LLMs) offer powerful capabilities but incur\nsubstantial computational costs, driving the need for efficient compression\ntechniques. This study evaluates the impact of popular compression methods -\nMagnitude Pruning, SparseGPT, and Wanda - on the LLaMA-2-7B model, focusing on\nthe trade-offs between model size reduction, downstream task performance, and\nthe role of calibration data. Our findings reveal that while SparseGPT and\nWanda preserve perplexity even at 50% sparsity, they suffer significant\ndegradation on downstream tasks, highlighting the inadequacy of perplexity as\nthe sole evaluation metric. To address this, we introduce Jensen-Shannon (JS)\nDivergence as a more comprehensive metric that captures nuanced changes in\nmodel behavior post-compression. We further demonstrate that task-specific\ncalibration data significantly enhances the downstream performance of\ncompressed models compared to general calibration data. This research\nunderscores the necessity for diverse evaluation metrics and careful\ncalibration data selection to fully understand the complexities of LLM\ncompression and its implications for practical applications.\n","authors":["Bishwash Khanal","Jeffery M. Capone"],"pdf_url":"https://arxiv.org/pdf/2409.11233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2406.19071v2","updated":"2024-09-17T14:24:47Z","published":"2024-06-27T10:41:22Z","title":"EmPO: Emotion Grounding for Empathetic Response Generation through\n Preference Optimization","summary":" Empathetic response generation is a desirable aspect of conversational\nagents, crucial for facilitating engaging and emotionally intelligent\nmulti-turn conversations between humans and machines. Leveraging large language\nmodels for this task has shown promising results, yet challenges persist in\nensuring both the empathetic quality of the responses and retention of the\ngeneralization performance of the models. We propose a novel approach where we\nconstruct theory-driven preference datasets based on emotion grounding and use\nthem to align LLMs with preference optimization algorithms to address these\nchallenges. To evaluate empathetic response generation, we employ the\nEmpatheticDialogues dataset, assessing empathy with the diff-Epitome and\nBERTscore metrics and with multi-dimensional human evaluation. Additionally, we\nmeasure diversity and emotional valence using feature-based methods. We also\nevaluate the impact of training on the generalization performance using the\nMMLU benchmark and tasks from the Open LLM Leaderboard. The results show that\nLLMs can be aligned for empathetic response generation by preference\noptimization while retaining their general performance and that emotion\ngrounding can guide preference dataset creation. 
We make all datasets, source\ncode, and models publicly available. https://github.com/justtherightsize/empo\n","authors":["Ondrej Sotolar","Vojtech Formanek","Alok Debnath","Allison Lahnala","Charles Welch","Lucie Flek"],"pdf_url":"https://arxiv.org/pdf/2406.19071v2.pdf","comment":"v02, 8 pages long paper, EMNLP ACL style"},{"id":"http://arxiv.org/abs/2406.12419v2","updated":"2024-09-17T14:18:11Z","published":"2024-06-18T09:12:11Z","title":"AI-Assisted Human Evaluation of Machine Translation","summary":" Annually, research teams spend large amounts of money to evaluate the quality\nof machine translation systems (WMT, inter alia). This is expensive because it\nrequires a lot of expert human labor. The recently adopted annotation protocol,\nError Span Annotation (ESA), has annotators marking erroneous parts of the\ntranslation and then assigning a final score. A lot of the annotator time is\nspent on scanning the translation for possible errors. In our work, we help the\nannotators by pre-filling the error annotations with recall-oriented automatic\nquality estimation. With this AI assistance, we obtain annotations at the same\nquality level while cutting down the time per span annotation by half\n(71s/error span $\\rightarrow$ 31s/error span). The biggest advantage of the\nESA$^\\mathrm{AI}$ protocol is an accurate priming of annotators (pre-filled\nerror spans) before they assign the final score. This also alleviates a\npotential automation bias, which we confirm to be low. In addition, the\nannotation budget can be reduced by almost 25\\% with filtering of examples that\nthe AI deems to be very likely to be correct.\n","authors":["Vilém Zouhar","Tom Kocmi","Mrinmaya Sachan"],"pdf_url":"https://arxiv.org/pdf/2406.12419v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11218v1","updated":"2024-09-17T14:12:08Z","published":"2024-09-17T14:12:08Z","title":"Exploring ChatGPT-based Augmentation Strategies for Contrastive\n Aspect-based Sentiment Analysis","summary":" Aspect-based sentiment analysis (ABSA) involves identifying sentiment towards\nspecific aspect terms in a sentence and allows us to uncover nuanced\nperspectives and attitudes on particular aspects of a product, service, or\ntopic. However, the scarcity of labeled data poses a significant challenge to\ntraining high-quality models. To address this issue, we explore the potential\nof data augmentation using ChatGPT, a well-performing large language model\n(LLM), to enhance the sentiment classification performance towards aspect\nterms. Specifically, we explore three data augmentation strategies based on\nChatGPT: context-focused, aspect-focused, and context-aspect data augmentation\ntechniques. Context-focused data augmentation focuses on changing the word\nexpression of context words in the sentence while keeping aspect terms\nunchanged. In contrast, aspect-focused data augmentation aims to change aspect\nterms but keep context words unchanged. Context-Aspect data augmentation\nintegrates the above two data augmentations to generate augmented samples.\nFurthermore, we incorporate contrastive learning into the ABSA tasks to improve\nperformance. Extensive experiments show that all three data augmentation\ntechniques lead to performance improvements, with the context-aspect data\naugmentation strategy performing best and surpassing the performance of the\nbaseline models.\n","authors":["Lingling Xu","Haoran Xie","S. 
Joe Qin","Fu Lee Wang","Xiaohui Tao"],"pdf_url":"https://arxiv.org/pdf/2409.11218v1.pdf","comment":"8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2409.11212v1","updated":"2024-09-17T14:05:58Z","published":"2024-09-17T14:05:58Z","title":"Self-Evolutionary Large Language Models through Uncertainty-Enhanced\n Preference Optimization","summary":" Iterative preference optimization has recently become one of the de-facto\ntraining paradigms for large language models (LLMs), but the performance is\nstill underwhelming due to too much noisy preference data yielded in the loop.\nTo combat this issue, we present an \\textbf{U}ncertainty-enhanced\n\\textbf{P}reference \\textbf{O}ptimization (UPO) framework to make the LLM\nself-evolve with reliable feedback. The key idea is mitigating the noisy\npreference data derived from the current policy and reward models by performing\npair-wise uncertainty estimation and judiciously reliable feedback sampling. To\nreach this goal, we thus introduce an estimator model, which incorporates Monte\nCarlo (MC) dropout in Bayesian neural network (BNN) to perform uncertainty\nestimation for the preference data derived from the LLM policy. Compared to the\nexisting methods that directly filter generated responses based on the reward\nscore, the estimator focuses on the model uncertainty in a pair-wise manner and\neffectively bypasses the confirmation bias problem of the reward model.\nAdditionally, we also propose an uncertainty-enhanced self-evolution algorithm\nto improve the robustness of preference optimization and encourage the LLM to\ngenerate responses with both high reward and certainty. Extensive experiments\nover multiple benchmarks demonstrate that our framework substantially\nalleviates the noisy problem and improves the performance of iterative\npreference optimization.\n","authors":["Jianing Wang","Yang Zhou","Xiaocheng Zhang","Mengjiao Bao","Peng Yan"],"pdf_url":"https://arxiv.org/pdf/2409.11212v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2406.11629v4","updated":"2024-09-17T14:04:27Z","published":"2024-06-17T15:11:58Z","title":"Can Many-Shot In-Context Learning Help LLMs as Evaluators? A Preliminary\n Empirical Study","summary":" Utilizing Large Language Models (LLMs) as evaluators for evaluating the\nperformance of LLMs has recently garnered attention. However, this kind of\nevaluation approach is affected by potential biases in LLMs, raising concerns\nabout the accuracy and reliability of the evaluation results. To mitigate this\nissue, we propose and study two many-shot ICL prompts, which rely on two\nversions of many-shot ICL prompt templates for helping LLM evaluators to\nmitigate the potential biases in LLMs, \\textbf{M}any-\\textbf{S}hot\n\\textbf{w}ith \\textbf{R}eference (\\textbf{MSwR}) and\n\\textbf{M}any-\\textbf{S}hot with\\textbf{o}ut \\textbf{R}eference\n(\\textbf{MSoR}). Concretely, the former utilizes in-context examples with\nmodel-generated rationales as guidance, and the latter without. Based on the\ndesigned prompts, we investigate the impact of scaling the number of in-context\nexamples on the consistency and quality of the evaluation results. Experimental\nresults show that advanced LLMs, such as GPT-4o, perform better in the\nmany-shot regime than in the zero-shot regime. Furthermore, we reveal the\nsymbol bias hidden in the selection bias of LLMs and propose a simple yet\neffective approach to mitigate the bias. 
Experimental results further verify\nthe effectiveness of the symbol bias mitigation approach.\n","authors":["Mingyang Song","Mao Zheng","Xuan Luo"],"pdf_url":"https://arxiv.org/pdf/2406.11629v4.pdf","comment":"work in progress"},{"id":"http://arxiv.org/abs/2406.05804v3","updated":"2024-09-17T14:02:29Z","published":"2024-06-09T14:42:55Z","title":"A Review of Prominent Paradigms for LLM-Based Agents: Tool Use\n (Including RAG), Planning, and Feedback Learning","summary":" Tool use, planning, and feedback learning are currently three prominent\nparadigms for developing Large Language Model (LLM)-based agents across various\ntasks. Although numerous frameworks have been devised for each paradigm, their\nintricate workflows and inconsistent taxonomy create challenges in\nunderstanding and reviewing the frameworks across different paradigms. This\nsurvey introduces a unified taxonomy to systematically review and discuss these\nframeworks. Specifically, 1) the taxonomy defines environments/tasks, common\nLLM-profiled roles (policy models, evaluators, and dynamic models), and\nuniversally applicable workflows found in prior work, and 2) it enables a\ncomparison of key perspectives on LMPR implementations and workflow usage\nacross different agent paradigms.\n","authors":["Xinzhe Li"],"pdf_url":"https://arxiv.org/pdf/2406.05804v3.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2407.09590v2","updated":"2024-09-17T13:48:50Z","published":"2024-07-12T17:25:02Z","title":"Diversifying the Expert Knowledge for Task-Agnostic Pruning in Sparse\n Mixture-of-Experts","summary":" By increasing model parameters but activating them sparsely when performing a\ntask, the use of Mixture-of-Experts (MoE) architecture significantly improves\nthe performance of Large Language Models (LLMs) without increasing the\ninference cost. However, the memory consumption due to the growing number of\nexperts presents a challenge to the deployment of these models in many real\nworld settings. Our empirical study reveals that some experts encode redundant\nknowledge during pre-training. We thus propose a method of grouping and pruning\nsimilar experts to improve the model's parameter efficiency. We validate the\neffectiveness of our method by pruning three state-of-the-art MoE\narchitectures, including Mixtral, Deepseek-MoE, and Qwen. The evaluation shows\nthat our method outperforms other model pruning methods on a range of natural\nlanguage tasks. We will release our code to facilitate future research.\n","authors":["Zeliang Zhang","Xiaodong Liu","Hao Cheng","Chenliang Xu","Jianfeng Gao"],"pdf_url":"https://arxiv.org/pdf/2407.09590v2.pdf","comment":"13pages, 6 figures"},{"id":"http://arxiv.org/abs/2409.11170v1","updated":"2024-09-17T13:24:29Z","published":"2024-09-17T13:24:29Z","title":"Capturing Differences in Character Representations Between Communities:\n An Initial Study with Fandom","summary":" Sociolinguistic theories have highlighted how narratives are often retold,\nco-constructed and reconceptualized in collaborative settings. This working\npaper focuses on the re-interpretation of characters, an integral part of the\nnarrative story-world, and attempts to study how this may be computationally\ncompared between online communities. Using online fandom - a highly communal\nphenomenon that has been largely studied qualitatively - as data, computational\nmethods were applied to explore shifts in character representations between two\ncommunities and the original text. 
Specifically, text from the Harry Potter\nnovels, r/HarryPotter subreddit, and fanfiction on Archive of Our Own were\nanalyzed for changes in character mentions, centrality measures from\nco-occurrence networks, and semantic associations. While fandom elevates\nsecondary characters as found in past work, the two fan communities prioritize\ndifferent subsets of characters. Word embedding tests reveal starkly different\nassociations of the same characters between communities on the gendered\nconcepts of femininity/masculinity, cruelty, and beauty. Furthermore,\nfanfiction descriptions of a male character analyzed between romance pairings\nscored higher for feminine-coded characteristics in male-male romance, matching\npast qualitative theorizing. The results highlight the potential for\ncomputational methods to assist in capturing the re-conceptualization of\nnarrative elements across communities and in supporting qualitative research on\nfandom.\n","authors":["Bianca N. Y. Kang"],"pdf_url":"https://arxiv.org/pdf/2409.11170v1.pdf","comment":"Accepted and presented as a working paper in SBP-BRiMS 2024"},{"id":"http://arxiv.org/abs/2409.11149v1","updated":"2024-09-17T13:03:12Z","published":"2024-09-17T13:03:12Z","title":"SAGED: A Holistic Bias-Benchmarking Pipeline for Language Models with\n Customisable Fairness Calibration","summary":" The development of unbiased large language models is widely recognized as\ncrucial, yet existing benchmarks fall short in detecting biases due to limited\nscope, contamination, and lack of a fairness baseline. SAGED(-Bias) is the\nfirst holistic benchmarking pipeline to address these problems. The pipeline\nencompasses five core stages: scraping materials, assembling benchmarks,\ngenerating responses, extracting numeric features, and diagnosing with\ndisparity metrics. SAGED includes metrics for max disparity, such as impact\nratio, and bias concentration, such as Max Z-scores. Noticing that assessment\ntool bias and contextual bias in prompts can distort evaluation, SAGED\nimplements counterfactual branching and baseline calibration for mitigation.\nFor demonstration, we use SAGED on G20 Countries with popular 8b-level models\nincluding Gemma2, Llama3.1, Mistral, and Qwen2. With sentiment analysis, we\nfind that while Mistral and Qwen2 show lower max disparity and higher bias\nconcentration than Gemma2 and Llama3.1, all models are notably biased against\ncountries like Russia and (except for Qwen2) China. With further experiments in\nwhich models role-play U.S. (vice-/former-) presidents, we see bias amplify\nand shift in heterogeneous directions. Moreover, we see that Qwen2 and Mistral do not\nengage in role-playing, while Llama3.1 and Gemma2 role-play Trump notably more\nintensively than Biden and Harris, indicating role-playing performance bias in\nthese models.\n","authors":["Xin Guan","Nathaniel Demchak","Saloni Gupta","Ze Wang","Ediz Ertekin Jr.","Adriano Koshiyama","Emre Kazim","Zekun Wu"],"pdf_url":"https://arxiv.org/pdf/2409.11149v1.pdf","comment":"Submitted to COLING 2025 Main Conference"},{"id":"http://arxiv.org/abs/2409.11148v1","updated":"2024-09-17T13:02:19Z","published":"2024-09-17T13:02:19Z","title":"Improving the Efficiency of Visually Augmented Language Models","summary":" Despite the impressive performance of autoregressive Language Models (LM), it\nhas been shown that due to reporting bias, LMs lack visual knowledge, i.e. they\ndo not know much about the visual world and its properties. 
To augment LMs with\nvisual knowledge, existing solutions often rely on explicit images, requiring\ntime-consuming retrieval or image generation systems. This paper shows that\nexplicit images are not necessary to visually augment an LM. Instead, we use\nvisually-grounded text representations obtained from the well-known CLIP\nmultimodal system. For a fair comparison, we modify VALM, a visually-augmented\nLM which uses image retrieval and representation, to work directly with\nvisually-grounded text representations. We name this new model BLIND-VALM. We\nshow that BLIND-VALM performs on par with VALM for Visual Language\nUnderstanding (VLU), Natural Language Understanding (NLU) and Language Modeling\ntasks, despite being significantly more efficient and simpler. We also show\nthat when scaling up our model within the compute budget of VALM, either by increasing\nthe model size or the pre-training corpus size, we outperform VALM on all the\nevaluation tasks.\n","authors":["Paula Ontalvilla","Aitor Ormazabal","Gorka Azkune"],"pdf_url":"https://arxiv.org/pdf/2409.11148v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11147v1","updated":"2024-09-17T12:58:29Z","published":"2024-09-17T12:58:29Z","title":"Reasoning Graph Enhanced Exemplars Retrieval for In-Context Learning","summary":" Large language models (LLMs) have exhibited remarkable few-shot learning\ncapabilities and unified the paradigm of NLP tasks through the in-context\nlearning (ICL) technique. Despite the success of ICL, the quality of the\nexemplar demonstrations can significantly influence the LLM's performance.\nExisting exemplar selection methods mainly focus on the semantic similarity\nbetween queries and candidate exemplars. On the other hand, the logical\nconnections between reasoning steps can be beneficial to depict the\nproblem-solving process as well. In this paper, we propose a novel method\nnamed Reasoning Graph-enhanced Exemplar Retrieval (RGER). RGER first queries the LLM\nto generate an initial response, then expresses the intermediate problem-solving\nsteps as a graph structure. After that, it employs a graph kernel to select\nexemplars with semantic and structural similarity. Extensive experiments\ndemonstrate that the structural relationship is helpful for the alignment of queries\nand candidate exemplars. The efficacy of RGER on math and logic reasoning tasks\nshowcases its superiority over state-of-the-art retrieval-based approaches. Our\ncode is released at https://github.com/Yukang-Lin/RGER.\n","authors":["Yukang Lin","Bingchen Zhong","Shuoran Jiang","Joanna Siebert","Qingcai Chen"],"pdf_url":"https://arxiv.org/pdf/2409.11147v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11143v1","updated":"2024-09-17T12:54:34Z","published":"2024-09-17T12:54:34Z","title":"Semformer: Transformer Language Models with Semantic Planning","summary":" Next-token prediction serves as the dominant component in current neural\nlanguage models. During the training phase, the model employs teacher forcing,\nwhich predicts tokens based on all preceding ground truth tokens. However, this\napproach has been found to create shortcuts, utilizing the revealed prefix to\nspuriously fit future tokens, potentially compromising the accuracy of the\nnext-token predictor. In this paper, we introduce Semformer, a novel method of\ntraining a Transformer language model that explicitly models the semantic\nplanning of the response. 
Specifically, we incorporate a sequence of planning\ntokens into the prefix, guiding the planning token representations to predict\nthe latent semantic representations of the response, which are induced by an\nautoencoder. In a minimal planning task (i.e., graph path-finding), our model\nexhibits near-perfect performance and effectively mitigates shortcut learning,\na feat that standard training methods and baseline models have been unable to\naccomplish. Furthermore, we pretrain Semformer from scratch with 125M\nparameters, demonstrating its efficacy through measures of perplexity,\nin-context learning, and fine-tuning on summarization tasks.\n","authors":["Yongjing Yin","Junran Ding","Kai Song","Yue Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.11143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11136v1","updated":"2024-09-17T12:42:55Z","published":"2024-09-17T12:42:55Z","title":"Promptriever: Instruction-Trained Retrievers Can Be Prompted Like\n Language Models","summary":" Instruction-tuned language models (LM) are able to respond to imperative\ncommands, providing a more natural user interface compared to their base\ncounterparts. In this work, we present Promptriever, the first retrieval model\nable to be prompted like an LM. To train Promptriever, we curate and release a\nnew instance-level instruction training set from MS MARCO, spanning nearly 500k\ninstances. Promptriever not only achieves strong performance on standard\nretrieval tasks, but also follows instructions. We observe: (1) large gains\n(reaching SoTA) on following detailed relevance instructions (+14.3 p-MRR /\n+3.1 nDCG on FollowIR), (2) significantly increased robustness to lexical\nchoices/phrasing in the query+instruction (+12.9 Robustness@10 on InstructIR),\nand (3) the ability to perform hyperparameter search via prompting to reliably\nimprove retrieval performance (+1.4 average increase on BEIR). Promptriever\ndemonstrates that retrieval models can be controlled with prompts on a\nper-query basis, setting the stage for future work aligning LM prompting\ntechniques with information retrieval.\n","authors":["Orion Weller","Benjamin Van Durme","Dawn Lawrie","Ashwin Paranjape","Yuhao Zhang","Jack Hessel"],"pdf_url":"https://arxiv.org/pdf/2409.11136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2403.18697v3","updated":"2024-09-17T12:27:07Z","published":"2024-03-27T15:46:25Z","title":"The Invalsi Benchmarks: measuring Linguistic and Mathematical\n understanding of Large Language Models in Italian","summary":" While Italian is a high-resource language, there are few Italian-native\nbenchmarks to evaluate generative Large Language Models (LLMs) in this\nlanguage. This work presents three new benchmarks: Invalsi MATE to evaluate\nmodels' performance on mathematical understanding in Italian, Invalsi ITA to\nevaluate language understanding in Italian and Olimpiadi MATE for more complex\nmathematical understanding.\n The first two benchmarks are based on the Invalsi tests, which are\nadministered to students aged between 6 and 18 within the Italian school\nsystem and have been validated by several experts in teaching and pedagogy; the\nthird one comes from the Italian high school math Olympics.\n We evaluate 10 powerful language models on these benchmarks and find that\nthey are bound by 71% accuracy on Invalsi MATE, achieved by Llama 3.1 70b\ninstruct, and by 88% on Invalsi ITA. 
For both Invalsi MATE and Invalsi ITA we\ncompare LLMs with the average performance of Italian students to show that\nLlama 3.1 is the only one to outperform them on Invalsi MATE, while most models\ndo so on Invalsi ITA. We then show that Olimpiadi MATE is more challenging than\nInvalsi MATE; the highest accuracy, achieved by Llama 3.1 405b instruct, is\n45%.\n We will make data and evaluation code openly available upon acceptance of the\npaper.\n","authors":["Giovanni Puccetti","Maria Cassese","Andrea Esuli"],"pdf_url":"https://arxiv.org/pdf/2403.18697v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10482v2","updated":"2024-09-17T12:10:49Z","published":"2024-09-16T17:18:11Z","title":"Schrodinger's Memory: Large Language Models","summary":" Memory is the foundation of all human activities; without memory, it would be\nnearly impossible for people to perform any task in daily life. With the\ndevelopment of Large Language Models (LLMs), their language capabilities are\nbecoming increasingly comparable to those of humans. But do LLMs have memory?\nBased on current performance, LLMs do appear to exhibit memory. So, what is the\nunderlying mechanism of this memory? Previous research has lacked a deep\nexploration of LLMs' memory capabilities and the underlying theory. In this\npaper, we use the Universal Approximation Theorem (UAT) to explain the memory\nmechanism in LLMs. We also conduct experiments to verify the memory\ncapabilities of various LLMs, proposing a new method to assess their abilities\nbased on this memory capability. We argue that LLM memory operates like\nSchr\\"odinger's memory, meaning that it only becomes observable when a specific\nmemory is queried. We can only determine if the model retains a memory based on\nits output in response to the query; otherwise, it remains indeterminate.\nFinally, we expand on this concept by comparing the memory capabilities of the\nhuman brain and LLMs, highlighting the similarities and differences in their\noperational mechanisms.\n","authors":["Wei Wang","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2409.10482v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11112v1","updated":"2024-09-17T12:06:05Z","published":"2024-09-17T12:06:05Z","title":"Strategic Insights in Human and Large Language Model Tactics at Word\n Guessing Games","summary":" At the beginning of 2022, a simplistic word-guessing game took the world by\nstorm and was further adapted to many languages beyond the original English\nversion. In this paper, we examine the strategies of daily word-guessing game\nplayers that have evolved during a period of over two years. A survey gathered\nfrom 25% of frequent players reveals their strategies and motivations for\ncontinuing the daily journey. We also explore the capability of several popular\nopen-access large language model systems and open-source models at\ncomprehending and playing the game in two different languages. 
Results\nhighlight the struggles of certain models to maintain correct guess length,\ntheir tendency to generate repetitions, and their hallucinations of\nnon-existent words and inflections.\n","authors":["Matīss Rikters","Sanita Reinsone"],"pdf_url":"https://arxiv.org/pdf/2409.11112v1.pdf","comment":"Published in the 4th Wordplay: When Language Meets Games Workshop @\n ACL 2024"},{"id":"http://arxiv.org/abs/2406.15796v3","updated":"2024-09-17T12:00:10Z","published":"2024-06-22T09:40:07Z","title":"Unveiling Entity-Level Unlearning for Large Language Models: A\n Comprehensive Analysis","summary":" Large language model unlearning has garnered increasing attention due to its\npotential to address security and privacy concerns, leading to extensive\nresearch in the field. However, much of this research has concentrated on\ninstance-level unlearning, specifically targeting the removal of predefined\ninstances containing sensitive content. This focus has left a significant gap\nin the exploration of full entity-level unlearning, which is critical in\nreal-world scenarios such as copyright protection. To this end, we propose a\nnovel task of Entity-level unlearning, which aims to erase entity-related\nknowledge from the target model completely. To thoroughly investigate this\ntask, we systematically evaluate trending unlearning algorithms, revealing that\ncurrent methods struggle to achieve effective entity-level unlearning. Then, we\nfurther explore the factors that influence the performance of the unlearning\nalgorithms, identifying that knowledge coverage and the size of the forget set\nplay pivotal roles. Notably, our analysis also uncovers that entities\nintroduced through fine-tuning are more vulnerable to unlearning than\npre-trained entities. These findings collectively offer valuable insights for\nadvancing entity-level unlearning for LLMs.\n","authors":["Weitao Ma","Xiaocheng Feng","Weihong Zhong","Lei Huang","Yangfan Ye","Xiachong Feng","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2406.15796v3.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2409.05486v2","updated":"2024-09-17T11:41:28Z","published":"2024-09-09T10:30:00Z","title":"Elsevier Arena: Human Evaluation of Chemistry/Biology/Health\n Foundational Large Language Models","summary":" arXiv admin comment: This version has been removed by arXiv administrators as\nthe submitter did not have the rights to agree to the license at the time of\nsubmission\n","authors":["Camilo Thorne","Christian Druckenbrodt","Kinga Szarkowska","Deepika Goyal","Pranita Marajan","Vijay Somanath","Corey Harper","Mao Yan","Tony Scerri"],"pdf_url":"https://arxiv.org/pdf/2409.05486v2.pdf","comment":"This document was submitted without obtaining all necessary\n permissions and therefore needs to be withdrawn. The corresponding author\n apologizes for any inconvenience this might cause"},{"id":"http://arxiv.org/abs/2409.09831v2","updated":"2024-09-17T11:18:37Z","published":"2024-09-15T19:11:01Z","title":"Generating Synthetic Free-text Medical Records with Low\n Re-identification Risk using Masked Language Modeling","summary":" In this paper, we present a system that generates synthetic free-text medical\nrecords, such as discharge summaries, admission notes and doctor\ncorrespondences, using Masked Language Modeling (MLM). Our system is designed\nto preserve the critical information of the records while introducing\nsignificant diversity and minimizing re-identification risk. 
The system\nincorporates a de-identification component that uses Philter to mask Protected\nHealth Information (PHI), followed by a Medical Entity Recognition (NER) model\nto retain key medical information. We explore various masking ratios and\nmask-filling techniques to balance the trade-off between diversity and fidelity\nin the synthetic outputs without affecting overall readability. Our results\ndemonstrate that the system can produce high-quality synthetic data with\nsignificant diversity while achieving a HIPAA-compliant PHI recall rate of 0.96\nand a low re-identification risk of 0.035. Furthermore, downstream evaluations\nusing a NER task reveal that the synthetic data can be effectively used to\ntrain models with performance comparable to those trained on real data. The\nflexibility of the system allows it to be adapted for specific use cases,\nmaking it a valuable tool for privacy-preserving data generation in medical\nresearch and healthcare applications.\n","authors":["Samuel Belkadi","Libo Ren","Nicolo Micheletti","Lifeng Han","Goran Nenadic"],"pdf_url":"https://arxiv.org/pdf/2409.09831v2.pdf","comment":"Added references and rephrased some sentences"},{"id":"http://arxiv.org/abs/2310.10679v2","updated":"2024-09-17T10:47:51Z","published":"2023-10-12T11:17:23Z","title":"Large language models can replicate cross-cultural differences in\n personality","summary":" We use a large-scale experiment (N=8000) to determine whether GPT-4 can\nreplicate cross-cultural differences in the Big Five, measured using the\nTen-Item Personality Inventory. We used the US and South Korea as the cultural\npair, given that prior research suggests substantial personality differences\nbetween people from these two countries. We manipulated the target of the\nsimulation (US vs. Korean), the language of the inventory (English vs. Korean),\nand the language model (GPT-4 vs. GPT-3.5). Our results show that GPT-4\nreplicated the cross-cultural differences for each factor. However, mean\nratings had an upward bias and exhibited lower variation than in the human\nsamples, as well as lower structural validity. We provide preliminary evidence\nthat LLMs can aid cross-cultural researchers and practitioners.\n","authors":["Paweł Niszczota","Mateusz Janczak","Michał Misiak"],"pdf_url":"https://arxiv.org/pdf/2310.10679v2.pdf","comment":"27 pages: 12 pages of manuscript + 15 pages of supplementary\n materials"},{"id":"http://arxiv.org/abs/2409.11057v1","updated":"2024-09-17T10:35:30Z","published":"2024-09-17T10:35:30Z","title":"KVPruner: Structural Pruning for Faster and Memory-Efficient Large\n Language Models","summary":" The bottleneck associated with the key-value (KV) cache presents a significant\nchallenge during the inference processes of large language models. While depth\npruning accelerates inference, it requires extensive recovery training, which\ncan take up to two weeks. On the other hand, width pruning retains much of the\nperformance but offers only slight speed gains. To tackle these challenges, we\npropose KVPruner to improve model efficiency while maintaining performance. Our\nmethod uses global perplexity-based analysis to determine the importance ratio\nfor each block and provides multiple strategies to prune non-essential KV\nchannels within blocks. Compared to the original model, KVPruner reduces\nruntime memory usage by 50% and boosts throughput by over 35%. 
Additionally,\nour method requires only two hours of LoRA fine-tuning on small datasets to\nrecover most of the performance.\n","authors":["Bo Lv","Quan Zhou","Xuanang Ding","Yan Wang","Zeming Ma"],"pdf_url":"https://arxiv.org/pdf/2409.11057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11056v1","updated":"2024-09-17T10:33:27Z","published":"2024-09-17T10:33:27Z","title":"Large Language Models are Good Multi-lingual Learners : When LLMs Meet\n Cross-lingual Prompts","summary":" With the advent of Large Language Models (LLMs), generating rule-based data\nfor real-world applications has become more accessible. Due to the inherent\nambiguity of natural language and the complexity of rule sets, especially in\nlong contexts, LLMs often struggle to follow all specified rules, frequently\nomitting at least one. To enhance the reasoning and understanding of LLMs on\nlong and complex contexts, we propose a novel prompting strategy, Multi-Lingual\nPrompt (MLPrompt), which automatically translates the error-prone rule\nthat an LLM struggles to follow into another language, thus drawing greater\nattention to it. Experimental results on public datasets across various tasks\nhave shown that MLPrompt can outperform state-of-the-art prompting methods such as\nChain of Thought, Tree of Thought, and Self-Consistency. Additionally, we\nintroduce a framework integrating MLPrompt with an auto-checking mechanism for\nstructured data generation, with a specific case study in text-to-MIP\ninstances. Further, we extend the proposed framework to text-to-SQL to\ndemonstrate its generation ability for structured data synthesis.\n","authors":["Teng Wang","Zhenqi He","Wing-Yin Yu","Xiaojin Fu","Xiongwei Han"],"pdf_url":"https://arxiv.org/pdf/2409.11056v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11055v1","updated":"2024-09-17T10:31:37Z","published":"2024-09-17T10:31:37Z","title":"A Comprehensive Evaluation of Quantized Instruction-Tuned Large Language\n Models: An Experimental Analysis up to 405B","summary":" Prior research works have evaluated quantized LLMs using limited metrics such\nas perplexity or a few basic knowledge tasks and old datasets. Additionally,\nrecent large-scale models such as Llama 3.1 with up to 405B have not been\nthoroughly examined. This paper evaluates the performance of instruction-tuned\nLLMs across various quantization methods (GPTQ, AWQ, SmoothQuant, and FP8) on\nmodels ranging from 7B to 405B. Using 13 benchmarks, we assess performance\nacross six task types: commonsense Q\&A, knowledge and language understanding,\ninstruction following, hallucination detection, mathematics, and dialogue. 
Our\nkey findings reveal that (1) quantizing a larger LLM to a similar size as a\nsmaller FP16 LLM generally performs better across most benchmarks, except for\nhallucination detection and instruction following; (2) performance varies\nsignificantly with different quantization methods, model size, and bit-width,\nwith weight-only methods often yielding better results in larger models; (3)\ntask difficulty does not significantly impact accuracy degradation due to\nquantization; and (4) the MT-Bench evaluation method has limited discriminatory\npower among recent high-performing LLMs.\n","authors":["Jemin Lee","Sihyeong Park","Jinse Kwon","Jihun Oh","Yongin Kwon"],"pdf_url":"https://arxiv.org/pdf/2409.11055v1.pdf","comment":"11 pages, 1 figure"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2406.15920v3","updated":"2024-09-17T23:32:57Z","published":"2024-06-22T19:20:35Z","title":"SEDMamba: Enhancing Selective State Space Modelling with Bottleneck\n Mechanism and Fine-to-Coarse Temporal Fusion for Efficient Error Detection in\n Robot-Assisted Surgery","summary":" Automated detection of surgical errors can improve robotic-assisted surgery.\nDespite promising progress, existing methods still face challenges in capturing\nrich temporal context to establish long-term dependencies while maintaining\ncomputational efficiency. In this paper, we propose a novel hierarchical model\nnamed SEDMamba, which incorporates the selective state space model (SSM) into\nsurgical error detection, facilitating efficient long sequence modelling with\nlinear complexity. SEDMamba enhances selective SSM with a bottleneck mechanism\nand fine-to-coarse temporal fusion (FCTF) to detect and temporally localize\nsurgical errors in long videos. The bottleneck mechanism compresses and\nrestores features within their spatial dimension, thereby reducing\ncomputational complexity. FCTF utilizes multiple dilated 1D convolutional\nlayers to merge temporal information across diverse scale ranges, accommodating\nerrors of varying duration. Our work also contributes the first-of-its-kind,\nframe-level, in-vivo surgical error dataset to support error detection in real\nsurgical cases. Specifically, we deploy the clinically validated observational\nclinical human reliability assessment tool (OCHRA) to annotate the errors\nduring suturing tasks in an open-source radical prostatectomy dataset\n(SAR-RARP50). Experimental results demonstrate that our SEDMamba outperforms\nstate-of-the-art methods with at least 1.82% AUC and 3.80% AP performance gains\nwith significantly reduced computational complexity. The corresponding error\nannotations, code and models will be released at\nhttps://github.com/wzjialang/SEDMamba.\n","authors":["Jialang Xu","Nazir Sirajudeen","Matthew Boal","Nader Francis","Danail Stoyanov","Evangelos Mazomenos"],"pdf_url":"https://arxiv.org/pdf/2406.15920v3.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2409.02253v2","updated":"2024-09-17T23:25:27Z","published":"2024-09-03T19:26:13Z","title":"How to Determine the Preferred Image Distribution of a Black-Box\n Vision-Language Model?","summary":" Large foundation models have revolutionized the field, yet challenges remain\nin optimizing multi-modal models for specialized visual tasks. We propose a\nnovel, generalizable methodology to identify preferred image distributions for\nblack-box Vision-Language Models (VLMs) by measuring output consistency across\nvaried input prompts. 
Applying this to different rendering types of 3D objects,\nwe demonstrate its efficacy across various domains requiring precise\ninterpretation of complex structures, with a focus on Computer-Aided Design\n(CAD) as an exemplar field. We further refine VLM outputs using in-context\nlearning with human feedback, significantly enhancing explanation quality. To\naddress the lack of benchmarks in specialized domains, we introduce CAD-VQA, a\nnew dataset for evaluating VLMs on CAD-related visual question answering tasks.\nOur evaluation of state-of-the-art VLMs on CAD-VQA establishes baseline\nperformance levels, providing a framework for advancing VLM capabilities in\ncomplex visual reasoning tasks across various fields requiring expert-level\nvisual interpretation. We release the dataset and evaluation codes at\n\\url{https://github.com/asgsaeid/cad_vqa}.\n","authors":["Saeid Asgari Taghanaki","Joseph Lambourne","Alana Mongkhounsavath"],"pdf_url":"https://arxiv.org/pdf/2409.02253v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11593v1","updated":"2024-09-17T22:58:20Z","published":"2024-09-17T22:58:20Z","title":"Self-Contrastive Forward-Forward Algorithm","summary":" The Forward-Forward (FF) algorithm is a recent, purely forward-mode learning\nmethod that updates weights locally and layer-wise and supports supervised as\nwell as unsupervised learning. These features make it ideal for applications\nsuch as brain-inspired learning, low-power hardware neural networks, and\ndistributed learning in large models. However, while FF has shown promise on\nwritten digit recognition tasks, its performance on natural images and\ntime-series remains a challenge. A key limitation is the need to generate\nhigh-quality negative examples for contrastive learning, especially in\nunsupervised tasks, where versatile solutions are currently lacking. To address\nthis, we introduce the Self-Contrastive Forward-Forward (SCFF) method, inspired\nby self-supervised contrastive learning. SCFF generates positive and negative\nexamples applicable across different datasets, surpassing existing local\nforward algorithms for unsupervised classification accuracy on MNIST (MLP:\n98.7%), CIFAR-10 (CNN: 80.75%), and STL-10 (CNN: 77.3%). Additionally, SCFF is\nthe first to enable FF training of recurrent neural networks, opening the door\nto more complex tasks and continuous-time video and text processing.\n","authors":["Xing Chen","Dongshu Liu","Jeremie Laydevant","Julie Grollier"],"pdf_url":"https://arxiv.org/pdf/2409.11593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06344v2","updated":"2024-09-17T21:31:09Z","published":"2024-01-12T03:26:06Z","title":"Hyper-STTN: Social Group-aware Spatial-Temporal Transformer Network for\n Human Trajectory Prediction with Hypergraph Reasoning","summary":" Predicting crowded intents and trajectories is crucial in various real-world\napplications, including service robots and autonomous vehicles. Understanding\nenvironmental dynamics is challenging, not only due to the complexities of\nmodeling pair-wise spatial and temporal interactions but also the diverse\ninfluence of group-wise interactions. To decode the comprehensive pair-wise and\ngroup-wise interactions in crowded scenarios, we introduce Hyper-STTN, a\nHypergraph-based Spatial-Temporal Transformer Network for crowd trajectory\nprediction. 
In Hyper-STTN, crowded group-wise correlations are constructed\nusing a set of multi-scale hypergraphs with varying group sizes, captured\nthrough random-walk probability-based hypergraph spectral convolution.\nAdditionally, a spatial-temporal transformer is adapted to capture pedestrians'\npair-wise latent interactions in spatial-temporal dimensions. These\nheterogeneous group-wise and pair-wise interactions are then fused and aligned through a\nmultimodal transformer network. Hyper-STTN outperforms other state-of-the-art\nbaselines and ablation models on 5 real-world pedestrian motion datasets.\n","authors":["Weizheng Wang","Chaowei Wang","Baijian Yang","Guohua Chen","Byung-Cheol Min"],"pdf_url":"https://arxiv.org/pdf/2401.06344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04743v5","updated":"2024-09-17T21:29:04Z","published":"2023-05-01T02:58:48Z","title":"MARS: Mask Attention Refinement with Sequential Quadtree Nodes for Car\n Damage Instance Segmentation","summary":" Evaluating car damages from misfortune is critical to the car insurance\nindustry. However, the accuracy is still insufficient for real-world\napplications since the deep learning network is not designed for car damage\nimages as inputs, and its segmented masks are still very coarse. This paper\npresents MARS (Mask Attention Refinement with Sequential quadtree nodes) for\ncar damage instance segmentation. Our MARS represents self-attention mechanisms\nto draw global dependencies between the sequential quadtree nodes layer and\nquadtree transformer to recalibrate channel weights and predict highly accurate\ninstance masks. Our extensive experiments demonstrate that MARS outperforms\nstate-of-the-art (SOTA) instance segmentation methods on three popular\nbenchmarks such as Mask R-CNN [9], PointRend [13], and Mask Transfiner [12], by\na large margin of +1.3 maskAP-based R50-FPN backbone and +2.3 maskAP-based\nR101-FPN backbone on the Thai car-damage dataset. Our demos are available at\nhttps://github.com/kaopanboonyuen/MARS.\n","authors":["Teerapong Panboonyuen","Naphat Nithisopa","Panin Pienroj","Laphonchai Jirachuphun","Chaiwasut Watthanasirikrit","Naruepon Pornwiriyakul"],"pdf_url":"https://arxiv.org/pdf/2305.04743v5.pdf","comment":"14 pages. arXiv admin note: substantial text overlap with\n arXiv:2111.13673 by other authors"},{"id":"http://arxiv.org/abs/2409.11564v1","updated":"2024-09-17T21:28:51Z","published":"2024-09-17T21:28:51Z","title":"Preference Tuning with Human Feedback on Language, Speech, and Vision\n Tasks: A Survey","summary":" Preference tuning is a crucial process for aligning deep generative models\nwith human preferences. This survey offers a thorough overview of recent\nadvancements in preference tuning and the integration of human feedback. The\npaper is organized into three main sections: 1) introduction and preliminaries:\nan introduction to reinforcement learning frameworks, preference tuning tasks,\nmodels, and datasets across various modalities: language, speech, and vision,\nas well as different policy approaches, 2) in-depth examination of each\npreference tuning approach: a detailed analysis of the methods used in\npreference tuning, and 3) applications, discussion, and future directions: an\nexploration of the applications of preference tuning in downstream tasks,\nincluding evaluation methods for different modalities, and an outlook on future\nresearch directions. 
Our objective is to present the latest methodologies in\npreference tuning and model alignment, enhancing the understanding of this\nfield for researchers and practitioners. We hope to encourage further\nengagement and innovation in this area.\n","authors":["Genta Indra Winata","Hanyang Zhao","Anirban Das","Wenpin Tang","David D. Yao","Shi-Xiong Zhang","Sambit Sahu"],"pdf_url":"https://arxiv.org/pdf/2409.11564v1.pdf","comment":"Survey paper"},{"id":"http://arxiv.org/abs/2409.00314v2","updated":"2024-09-17T21:26:09Z","published":"2024-08-31T00:52:29Z","title":"Towards Secure and Usable 3D Assets: A Novel Framework for Automatic\n Visible Watermarking","summary":" 3D models, particularly AI-generated ones, have witnessed a recent surge\nacross various industries such as entertainment. Hence, there is an alarming\nneed to protect the intellectual property and avoid the misuse of these\nvaluable assets. As a viable solution to address these concerns, we rigorously\ndefine the novel task of automated 3D visible watermarking in terms of two\ncompeting aspects: watermark quality and asset utility. Moreover, we propose a\nmethod of embedding visible watermarks that automatically determines the right\nlocation, orientation, and number of watermarks to be placed on arbitrary 3D\nassets for high watermark quality and asset utility. Our method is based on a\nnovel rigid-body optimization that uses back-propagation to automatically learn\ntransforms for ideal watermark placement. In addition, we propose a novel\ncurvature-matching method for fusing the watermark into the 3D model that\nfurther improves readability and security. Finally, we provide a detailed\nexperimental analysis on two benchmark 3D datasets validating the superior\nperformance of our approach in comparison to baselines. Code and demo are\navailable.\n","authors":["Gursimran Singh","Tianxi Hu","Mohammad Akbari","Qiang Tang","Yong Zhang"],"pdf_url":"https://arxiv.org/pdf/2409.00314v2.pdf","comment":"Accepted to WACV2025"},{"id":"http://arxiv.org/abs/2407.13862v2","updated":"2024-09-17T21:17:54Z","published":"2024-07-18T19:15:52Z","title":"Enhancing Worldwide Image Geolocation by Ensembling Satellite-Based\n Ground-Level Attribute Predictors","summary":" We examine the challenge of estimating the location of a single ground-level\nimage in the absence of GPS or other location metadata. Currently, geolocation\nsystems are evaluated by measuring the Great Circle Distance between the\npredicted location and ground truth. Because this measurement only uses a\nsingle point, it cannot assess the distribution of predictions by geolocation\nsystems. Evaluation of a distribution of potential locations (areas) is\nrequired when there are follow-on procedures to further narrow down or verify\nthe location. This is especially important in poorly-sampled regions e.g. rural\nand wilderness areas.\n In this paper, we introduce a novel metric, Recall vs Area (RvA), which\nmeasures the accuracy of estimated distributions of locations. RvA treats image\ngeolocation results similarly to document retrieval, measuring recall as a\nfunction of area: For a ranked list of (possibly discontiguous) predicted\nregions, we measure the area required for accumulated regions to contain the\nground truth coordinate. 
This produces a curve similar to a precision-recall\ncurve, where \"precision\" is replaced by square kilometers area, enabling\nevaluation for different downstream search area budgets.\n Following from this view of the problem, we then examine an ensembling\napproach to global-scale image geolocation, which incorporates information from\nmultiple sources, and can readily incorporate multiple models, attribute\npredictors, and data sources. We study its effectiveness by combining the\ngeolocation models GeoEstimation and the current state-of-the-art, GeoCLIP,\nwith attribute predictors based on Oak Ridge National Laboratory LandScan and\nEuropean Space Agency Climate Change Initiative Land Cover. We find significant\nimprovements in image geolocation for areas that are under-represented in the\ntraining set, particularly non-urban areas, on both Im2GPS3k and Street View\nimages.\n","authors":["Michael J. Bianco","David Eigen","Michael Gormish"],"pdf_url":"https://arxiv.org/pdf/2407.13862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11555v1","updated":"2024-09-17T20:53:47Z","published":"2024-09-17T20:53:47Z","title":"Open-Set Semantic Uncertainty Aware Metric-Semantic Graph Matching","summary":" Underwater object-level mapping requires incorporating visual foundation\nmodels to handle the uncommon and often previously unseen object classes\nencountered in marine scenarios. In this work, a metric of semantic uncertainty\nfor open-set object detections produced by visual foundation models is\ncalculated and then incorporated into an object-level uncertainty tracking\nframework. Object-level uncertainties and geometric relationships between\nobjects are used to enable robust object-level loop closure detection for\nunknown object classes. The above loop closure detection problem is formulated\nas a graph-matching problem. While graph matching, in general, is NP-Complete,\na solver for an equivalent formulation of the proposed graph matching problem\nas a graph editing problem is tested on multiple challenging underwater scenes.\nResults for this solver as well as three other solvers demonstrate that the\nproposed methods are feasible for real-time use in marine environments for the\nrobust, open-set, multi-object, semantic-uncertainty-aware loop closure\ndetection. Further experimental results on the KITTI dataset demonstrate that\nthe method generalizes to large-scale terrestrial scenes.\n","authors":["Kurran Singh","John J. Leonard"],"pdf_url":"https://arxiv.org/pdf/2409.11555v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11552v1","updated":"2024-09-17T20:47:32Z","published":"2024-09-17T20:47:32Z","title":"Multi-Domain Data Aggregation for Axon and Myelin Segmentation in\n Histology Images","summary":" Quantifying axon and myelin properties (e.g., axon diameter, myelin\nthickness, g-ratio) in histology images can provide useful information about\nmicrostructural changes caused by neurodegenerative diseases. Automatic tissue\nsegmentation is an important tool for these datasets, as a single stained\nsection can contain up to thousands of axons. Advances in deep learning have\nmade this task quick and reliable with minimal overhead, but a deep learning\nmodel trained by one research group will hardly ever be usable by other groups\ndue to differences in their histology training data. 
This is partly due to\nsubject diversity (different body parts, species, genetics, pathologies) and\nalso to the range of modern microscopy imaging techniques resulting in a wide\nvariability of image features (i.e., contrast, resolution). There is a pressing\nneed to make AI accessible to neuroscience researchers to facilitate and\naccelerate their workflow, but publicly available models are scarce and poorly\nmaintained. Our approach is to aggregate data from multiple imaging modalities\n(bright field, electron microscopy, Raman spectroscopy) and species (mouse,\nrat, rabbit, human), to create an open-source, durable tool for axon and myelin\nsegmentation. Our generalist model makes it easier for researchers to process\ntheir data and can be fine-tuned for better performance on specific domains. We\nstudy the benefits of different aggregation schemes. This multi-domain\nsegmentation model performs better than single-modality dedicated learners\n(p=0.03077), generalizes better on out-of-distribution data and is easier to\nuse and maintain. Importantly, we package the segmentation tool into a\nwell-maintained open-source software ecosystem (see\nhttps://github.com/axondeepseg/axondeepseg).\n","authors":["Armand Collin","Arthur Boschet","Mathieu Boudreau","Julien Cohen-Adad"],"pdf_url":"https://arxiv.org/pdf/2409.11552v1.pdf","comment":"12 pages, 8 figures"},{"id":"http://arxiv.org/abs/2402.10259v3","updated":"2024-09-17T20:46:03Z","published":"2024-02-15T18:42:33Z","title":"GaussianObject: High-Quality 3D Object Reconstruction from Four Views\n with Gaussian Splatting","summary":" Reconstructing and rendering 3D objects from highly sparse views is of\ncritical importance for promoting applications of 3D vision techniques and\nimproving user experience. However, images from sparse views only contain very\nlimited 3D information, leading to two significant challenges: 1) Difficulty in\nbuilding multi-view consistency as images for matching are too few; 2)\nPartially omitted or highly compressed object information as view coverage is\ninsufficient. To tackle these challenges, we propose GaussianObject, a\nframework to represent and render the 3D object with Gaussian splatting that\nachieves high rendering quality with only 4 input images. We first introduce\ntechniques of visual hull and floater elimination, which explicitly inject\nstructure priors into the initial optimization process to help build multi-view\nconsistency, yielding a coarse 3D Gaussian representation. Then we construct a\nGaussian repair model based on diffusion models to supplement the omitted\nobject information, where Gaussians are further refined. We design a\nself-generating strategy to obtain image pairs for training the repair model.\nWe further design a COLMAP-free variant, where pre-given accurate camera poses\nare not required, which achieves competitive quality and facilitates wider\napplications. 
GaussianObject is evaluated on several challenging datasets,\nincluding MipNeRF360, OmniObject3D, OpenIllumination, and our-collected unposed\nimages, achieving superior performance from only four views and significantly\noutperforming previous SOTA methods.\n","authors":["Chen Yang","Sikuang Li","Jiemin Fang","Ruofan Liang","Lingxi Xie","Xiaopeng Zhang","Wei Shen","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2402.10259v3.pdf","comment":"Project page: https://gaussianobject.github.io/"},{"id":"http://arxiv.org/abs/2409.11546v1","updated":"2024-09-17T20:36:03Z","published":"2024-09-17T20:36:03Z","title":"NCT-CRC-HE: Not All Histopathological Datasets Are Equally Useful","summary":" Numerous deep learning-based solutions have been proposed for\nhistopathological image analysis over the past years. While they usually\ndemonstrate exceptionally high accuracy, one key question is whether their\nprecision might be affected by low-level image properties not related to\nhistopathology but caused by microscopy image handling and pre-processing. In\nthis paper, we analyze a popular NCT-CRC-HE-100K colorectal cancer dataset used\nin numerous prior works and show that both this dataset and the obtained\nresults may be affected by data-specific biases. The most prominent revealed\ndataset issues are inappropriate color normalization, severe JPEG artifacts\ninconsistent between different classes, and completely corrupted tissue samples\nresulting from incorrect image dynamic range handling. We show that even the\nsimplest model using only 3 features per image (red, green and blue color\nintensities) can demonstrate over 50% accuracy on this 9-class dataset, while\nusing color histogram not explicitly capturing cell morphology features yields\nover 82% accuracy. Moreover, we show that a basic EfficientNet-B0 ImageNet\npretrained model can achieve over 97.7% accuracy on this dataset, outperforming\nall previously proposed solutions developed for this task, including dedicated\nfoundation histopathological models and large cell morphology-aware neural\nnetworks. The NCT-CRC-HE dataset is publicly available and can be freely used\nto replicate the presented results. The codes and pre-trained models used in\nthis paper are available at\nhttps://github.com/gmalivenko/NCT-CRC-HE-experiments\n","authors":["Andrey Ignatov","Grigory Malivenko"],"pdf_url":"https://arxiv.org/pdf/2409.11546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11542v1","updated":"2024-09-17T20:30:35Z","published":"2024-09-17T20:30:35Z","title":"VALO: A Versatile Anytime Framework for LiDAR-based Object Detection\n Deep Neural Networks","summary":" This work addresses the challenge of adapting dynamic deadline requirements\nfor LiDAR object detection deep neural networks (DNNs). The computing latency\nof object detection is critically important to ensure safe and efficient\nnavigation. However, state-of-the-art LiDAR object detection DNNs often exhibit\nsignificant latency, hindering their real-time performance on\nresource-constrained edge platforms. Therefore, a tradeoff between detection\naccuracy and latency should be dynamically managed at runtime to achieve\noptimum results.\n In this paper, we introduce VALO (Versatile Anytime algorithm for LiDAR\nObject detection), a novel data-centric approach that enables anytime computing\nof 3D LiDAR object detection DNNs. VALO employs a deadline-aware scheduler to\nselectively process input regions, making execution time and accuracy tradeoffs\nwithout architectural modifications. 
Additionally, it leverages efficient\nforecasting of past detection results to mitigate possible loss of accuracy due\nto partial processing of input. Finally, it utilizes a novel input reduction\ntechnique within its detection heads to significantly accelerate execution\nwithout sacrificing accuracy.\n We implement VALO on state-of-the-art 3D LiDAR object detection networks,\nnamely CenterPoint and VoxelNext, and demonstrate its dynamic adaptability to a\nwide range of time constraints while achieving higher accuracy than the prior\nstate-of-the-art. Code is available at\nhttps://github.com/CSL-KU/VALO.\n","authors":["Ahmet Soyyigit","Shuochao Yao","Heechul Yun"],"pdf_url":"https://arxiv.org/pdf/2409.11542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11536v1","updated":"2024-09-17T20:13:54Z","published":"2024-09-17T20:13:54Z","title":"Obfuscation Based Privacy Preserving Representations are Recoverable\n Using Neighborhood Information","summary":" Rapid growth in the popularity of AR/VR/MR applications and cloud-based\nvisual localization systems has given rise to an increased focus on the privacy\nof user content in the localization process.\n This privacy concern has been further escalated by the ability of deep neural\nnetworks to recover detailed images of a scene from a sparse set of 3D or 2D\npoints and their descriptors - the so-called inversion attacks.\n Research on privacy-preserving localization has therefore focused on\npreventing these inversion attacks on both the query image keypoints and the 3D\npoints of the scene map.\n To this end, several geometry obfuscation techniques that lift points to\nhigher-dimensional spaces, i.e., lines or planes, or that swap coordinates\nbetween points have been proposed.\n In this paper, we point to a common weakness of these obfuscations that\nallows recovering approximations of the original point positions under the\nassumption of known neighborhoods.\n We further show that these neighborhoods can be computed by learning to\nidentify descriptors that co-occur in neighborhoods.\n Extensive experiments show that our approach for point recovery is\npractically applicable to all existing geometric obfuscation schemes.\n Our results show that these schemes should not be considered\nprivacy-preserving, even though they are claimed to be privacy-preserving.\n Code will be available at\n\\url{https://github.com/kunalchelani/RecoverPointsNeighborhood}.\n","authors":["Kunal Chelani","Assia Benbihi","Fredrik Kahl","Torsten Sattler","Zuzana Kukelova"],"pdf_url":"https://arxiv.org/pdf/2409.11536v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11534v1","updated":"2024-09-17T20:12:50Z","published":"2024-09-17T20:12:50Z","title":"Unsupervised Hybrid framework for ANomaly Detection (HAND) -- applied to\n Screening Mammogram","summary":" Out-of-distribution (OOD) detection is crucial for enhancing the\ngeneralization of AI models used in mammogram screening. Given the challenge of\nlimited prior knowledge about OOD samples in external datasets, unsupervised\ngenerative learning is a preferable solution which trains the model to discern\nthe normal characteristics of in-distribution (ID) data. The hypothesis is that\nduring inference, the model aims to reconstruct ID samples accurately, while\nOOD samples exhibit poorer reconstruction due to their divergence from\nnormality. 
Inspired by state-of-the-art (SOTA) hybrid architectures combining\nCNNs and transformers, we developed a novel backbone - HAND, for detecting OOD\nfrom large-scale digital screening mammogram studies. To boost the learning\nefficiency, we incorporated synthetic OOD samples and a parallel discriminator\nin the latent space to distinguish between ID and OOD samples. Gradient\nreversal to the OOD reconstruction loss penalizes the model for learning OOD\nreconstructions. An anomaly score is computed by weighting the reconstruction\nand discriminator loss. On internal RSNA mammogram held-out test and external\nMayo clinic hand-curated dataset, the proposed HAND model outperformed\nencoder-based and GAN-based baselines, and interestingly, it also outperformed\nthe hybrid CNN+transformer baselines. Therefore, the proposed HAND pipeline\noffers an automated efficient computational solution for domain-specific\nquality checks in external screening mammograms, yielding actionable insights\nwithout direct exposure to the private medical imaging data.\n","authors":["Zhemin Zhang","Bhavika Patel","Bhavik Patel","Imon Banerjee"],"pdf_url":"https://arxiv.org/pdf/2409.11534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11518v1","updated":"2024-09-17T19:40:28Z","published":"2024-09-17T19:40:28Z","title":"Robot Manipulation in Salient Vision through Referring Image\n Segmentation and Geometric Constraints","summary":" In this paper, we perform robot manipulation activities in real-world\nenvironments with language contexts by integrating a compact referring image\nsegmentation model into the robot's perception module. First, we propose\nCLIPU$^2$Net, a lightweight referring image segmentation model designed for\nfine-grain boundary and structure segmentation from language expressions. Then,\nwe deploy the model in an eye-in-hand visual servoing system to enact robot\ncontrol in the real world. The key to our system is the representation of\nsalient visual information as geometric constraints, linking the robot's visual\nperception to actionable commands. Experimental results on 46 real-world robot\nmanipulation tasks demonstrate that our method outperforms traditional visual\nservoing methods relying on labor-intensive feature annotations, excels in\nfine-grain referring image segmentation with a compact decoder size of 6.6 MB,\nand supports robot control across diverse contexts.\n","authors":["Chen Jiang","Allie Luo","Martin Jagersand"],"pdf_url":"https://arxiv.org/pdf/2409.11518v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11513v1","updated":"2024-09-17T19:36:37Z","published":"2024-09-17T19:36:37Z","title":"Mamba Fusion: Learning Actions Through Questioning","summary":" Video Language Models (VLMs) are crucial for generalizing across diverse\ntasks and using language cues to enhance learning. While transformer-based\narchitectures have been the de facto in vision-language training, they face\nchallenges like quadratic computational complexity, high GPU memory usage, and\ndifficulty with long-term dependencies. To address these limitations, we\nintroduce MambaVL, a novel model that leverages recent advancements in\nselective state space modality fusion to efficiently capture long-range\ndependencies and learn joint representations for vision and language data.\nMambaVL utilizes a shared state transition matrix across both modalities,\nallowing the model to capture information about actions from multiple\nperspectives within the scene. 
Furthermore, we propose a question-answering\ntask that helps guide the model toward relevant cues. These questions provide\ncritical information about actions, objects, and environmental context, leading\nto enhanced performance. As a result, MambaVL achieves state-of-the-art\nperformance in action recognition on the Epic-Kitchens-100 dataset and\noutperforms baseline methods in action anticipation.\n","authors":["Zhikang Dong","Apoorva Beedu","Jason Sheinkopf","Irfan Essa"],"pdf_url":"https://arxiv.org/pdf/2409.11513v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11512v1","updated":"2024-09-17T19:26:21Z","published":"2024-09-17T19:26:21Z","title":"Good Grasps Only: A data engine for self-supervised fine-tuning of pose\n estimation using grasp poses for verification","summary":" In this paper, we present a novel method for self-supervised fine-tuning of\npose estimation for bin-picking. Leveraging zero-shot pose estimation, our\napproach enables the robot to automatically obtain training data without manual\nlabeling. After pose estimation the object is grasped, and in-hand pose\nestimation is used for data validation. Our pipeline allows the system to\nfine-tune while the process is running, removing the need for a learning phase.\n The motivation behind our work lies in the need for rapid setup of pose\nestimation solutions. Specifically, we address the challenging task of bin\npicking, which plays a pivotal role in flexible robotic setups.\n Our method is implemented on a robotics work-cell, and tested with four\ndifferent objects. For all objects, our method increases the performance and\noutperforms a state-of-the-art method trained on the CAD model of the objects.\n","authors":["Frederik Hagelskjær"],"pdf_url":"https://arxiv.org/pdf/2409.11512v1.pdf","comment":"8 pages, 7 figures, 3 tables"},{"id":"http://arxiv.org/abs/2409.11508v1","updated":"2024-09-17T19:22:29Z","published":"2024-09-17T19:22:29Z","title":"Retinal Vessel Segmentation with Deep Graph and Capsule Reasoning","summary":" Effective retinal vessel segmentation requires a sophisticated integration of\nglobal contextual awareness and local vessel continuity. To address this\nchallenge, we propose the Graph Capsule Convolution Network (GCC-UNet), which\nmerges capsule convolutions with CNNs to capture both local and global\nfeatures. The Graph Capsule Convolution operator is specifically designed to\nenhance the representation of global context, while the Selective Graph\nAttention Fusion module ensures seamless integration of local and global\ninformation. To further improve vessel continuity, we introduce the Bottleneck\nGraph Attention module, which incorporates Channel-wise and Spatial Graph\nAttention mechanisms. The Multi-Scale Graph Fusion module adeptly combines\nfeatures from various scales. Our approach has been rigorously validated\nthrough experiments on widely used public datasets, with ablation studies\nconfirming the efficacy of each component. Comparative results highlight\nGCC-UNet's superior performance over existing methods, setting a new benchmark\nin retinal vessel segmentation. 
Notably, this work represents the first\nintegration of vanilla, graph, and capsule convolutional techniques in the\ndomain of medical image segmentation.\n","authors":["Xinxu Wei","Xi Lin","Haiyun Liu","Shixuan Zhao","Yongjie Li"],"pdf_url":"https://arxiv.org/pdf/2409.11508v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2408.10404v2","updated":"2024-09-17T18:23:16Z","published":"2024-08-19T20:39:21Z","title":"Accelerating Point Cloud Ground Segmentation: From Mechanical to\n Solid-State Lidars","summary":" In this study, we propose a novel parallel processing method for point cloud\nground segmentation, aimed at the technology evolution from mechanical to\nsolid-state Lidar (SSL). We first benchmark point-based, grid-based, and range\nimage-based ground segmentation algorithms using the SemanticKITTI dataset. Our\nresults indicate that the range image-based method offers superior performance\nand robustness, particularly in resilience to frame slicing. Implementing the\nproposed algorithm on an FPGA demonstrates significant improvements in\nprocessing speed and scalability of resource usage. Additionally, we develop a\ncustom dataset using camera-SSL equipment on our test vehicle to validate the\neffectiveness of the parallel processing approach for SSL frames in the real world,\nachieving processing rates up to 30.9 times faster than CPU implementations.\nThese findings underscore the potential of parallel processing strategies to\nenhance Lidar technologies for advanced perception tasks in autonomous vehicles\nand robotics. The data and code will be available post-publication on our\nGitHub repository:\n\\url{https://github.com/WPI-APA-Lab/GroundSeg-Solid-State-Lidar-Parallel-Processing}\n","authors":["Xiao Zhang","Zhanhong Huang","Garcia Gonzalez Antony","Xinming Huang"],"pdf_url":"https://arxiv.org/pdf/2408.10404v2.pdf","comment":"6 pages"},{"id":"http://arxiv.org/abs/2409.08345v2","updated":"2024-09-17T18:19:24Z","published":"2024-09-12T18:18:02Z","title":"SIG: A Synthetic Identity Generation Pipeline for Generating Evaluation\n Datasets for Face Recognition","summary":" As Artificial Intelligence applications expand, the evaluation of models\nfaces heightened scrutiny. Ensuring public readiness requires evaluation\ndatasets, which differ from training data by being disjoint and ethically\nsourced in compliance with privacy regulations. The performance and fairness of\nface recognition systems depend significantly on the quality and\nrepresentativeness of these evaluation datasets. This data is sometimes scraped\nfrom the internet without users' consent, causing ethical concerns that can\nprohibit its use without proper releases. In rare cases, data is collected in a\ncontrolled environment with consent; however, this process is time-consuming,\nexpensive, and logistically difficult to execute. This creates a barrier for\nthose unable to conjure the immense resources required to gather ethically\nsourced evaluation datasets. To address these challenges, we introduce the\nSynthetic Identity Generation pipeline, or SIG, that allows for the targeted\ncreation of ethical, balanced datasets for face recognition evaluation. Our\nproposed and demonstrated pipeline generates high-quality images of synthetic\nidentities with controllable pose, facial features, and demographic attributes,\nsuch as race, gender, and age. 
We also release an open-source evaluation\ndataset named ControlFace10k, consisting of 10,008 face images of 3,336 unique\nsynthetic identities balanced across race, gender, and age, generated using the\nproposed SIG pipeline. We analyze ControlFace10k along with a non-synthetic\nBUPT dataset using state-of-the-art face recognition algorithms to demonstrate\nits effectiveness as an evaluation tool. This analysis highlights the dataset's\ncharacteristics and its utility in assessing algorithmic bias across different\ndemographic groups.\n","authors":["Kassi Nzalasse","Rishav Raj","Eli Laird","Corey Clark"],"pdf_url":"https://arxiv.org/pdf/2409.08345v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11406v1","updated":"2024-09-17T17:59:33Z","published":"2024-09-17T17:59:33Z","title":"Phidias: A Generative Model for Creating 3D Content from Text, Image,\n and 3D Conditions with Reference-Augmented Diffusion","summary":" In 3D modeling, designers often use an existing 3D model as a reference to\ncreate new ones. This practice has inspired the development of Phidias, a novel\ngenerative model that uses diffusion for reference-augmented 3D generation.\nGiven an image, our method leverages a retrieved or user-provided 3D reference\nmodel to guide the generation process, thereby enhancing the generation\nquality, generalization ability, and controllability. Our model integrates\nthree key components: 1) meta-ControlNet that dynamically modulates the\nconditioning strength, 2) dynamic reference routing that mitigates misalignment\nbetween the input image and 3D reference, and 3) self-reference augmentations\nthat enable self-supervised training with a progressive curriculum.\nCollectively, these designs result in a clear improvement over existing\nmethods. Phidias establishes a unified framework for 3D generation using text,\nimage, and 3D conditions with versatile applications.\n","authors":["Zhenwei Wang","Tengfei Wang","Zexin He","Gerhard Hancke","Ziwei Liu","Rynson W. H. Lau"],"pdf_url":"https://arxiv.org/pdf/2409.11406v1.pdf","comment":"Project page: https://RAG-3D.github.io/"},{"id":"http://arxiv.org/abs/2409.11402v1","updated":"2024-09-17T17:59:06Z","published":"2024-09-17T17:59:06Z","title":"NVLM: Open Frontier-Class Multimodal LLMs","summary":" We introduce NVLM 1.0, a family of frontier-class multimodal large language\nmodels (LLMs) that achieve state-of-the-art results on vision-language tasks,\nrivaling the leading proprietary models (e.g., GPT-4o) and open-access models\n(e.g., Llama 3-V 405B and InternVL 2). Remarkably, NVLM 1.0 shows improved\ntext-only performance over its LLM backbone after multimodal training. In terms\nof model design, we perform a comprehensive comparison between decoder-only\nmultimodal LLMs (e.g., LLaVA) and cross-attention-based models (e.g.,\nFlamingo). Based on the strengths and weaknesses of both approaches, we propose\na novel architecture that enhances both training efficiency and multimodal\nreasoning capabilities. Furthermore, we introduce a 1-D tile-tagging design for\ntile-based dynamic high-resolution images, which significantly boosts\nperformance on multimodal reasoning and OCR-related tasks. Regarding training\ndata, we meticulously curate and provide detailed information on our multimodal\npretraining and supervised fine-tuning datasets. Our findings indicate that\ndataset quality and task diversity are more important than scale, even during\nthe pretraining phase, across all architectures. 
Notably, we develop\nproduction-grade multimodality for the NVLM-1.0 models, enabling them to excel\nin vision-language tasks while maintaining and even improving text-only\nperformance compared to their LLM backbones. To achieve this, we craft and\nintegrate a high-quality text-only dataset into multimodal training, alongside\na substantial amount of multimodal math and reasoning data, leading to enhanced\nmath and coding capabilities across modalities. To advance research in the\nfield, we are releasing the model weights and will open-source the code for the\ncommunity: https://nvlm-project.github.io/.\n","authors":["Wenliang Dai","Nayeon Lee","Boxin Wang","Zhuoling Yang","Zihan Liu","Jon Barker","Tuomas Rintamaki","Mohammad Shoeybi","Bryan Catanzaro","Wei Ping"],"pdf_url":"https://arxiv.org/pdf/2409.11402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11456v1","updated":"2024-09-17T17:48:12Z","published":"2024-09-17T17:48:12Z","title":"Two Stage Segmentation of Cervical Tumors using PocketNet","summary":" Cervical cancer remains the fourth most common malignancy amongst women\nworldwide. Concurrent chemoradiotherapy (CRT) serves as the mainstay\ndefinitive treatment regimen for locally advanced cervical cancers and includes\nexternal beam radiation followed by brachytherapy. Integral to radiotherapy\ntreatment planning is the routine contouring of both the target tumor at the\nlevel of the cervix, associated gynecologic anatomy and the adjacent organs at\nrisk (OARs). However, manual contouring of these structures is both time and\nlabor intensive and associated with known interobserver variability that can\nimpact treatment outcomes. While multiple tools have been developed to\nautomatically segment OARs and the high-risk clinical tumor volume (HR-CTV)\nusing computed tomography (CT) images, the development of deep\nlearning-based tumor segmentation tools using routine T2-weighted (T2w)\nmagnetic resonance imaging (MRI) addresses an unmet clinical need to improve\nthe routine contouring of both anatomical structures and cervical cancers,\nthereby increasing quality and consistency of radiotherapy planning. This work\napplied a novel deep-learning model (PocketNet) to segment the cervix, vagina,\nuterus, and tumor(s) on T2w MRI. The performance of the PocketNet architecture\nwas evaluated when trained on data via 5-fold cross validation. PocketNet\nachieved a mean Dice-Sorensen similarity coefficient (DSC) exceeding 70% for\ntumor segmentation and 80% for organ segmentation. These results suggest that\nPocketNet is robust to variations in contrast protocols, providing reliable\nsegmentation of the ROIs.\n","authors":["Awj Twam","Megan Jacobsen","Rachel Glenn","Ann Klopp","Aradhana M. Venkatesan","David Fuentes"],"pdf_url":"https://arxiv.org/pdf/2409.11456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11383v1","updated":"2024-09-17T17:34:24Z","published":"2024-09-17T17:34:24Z","title":"Training Datasets Generation for Machine Learning: Application to Vision\n Based Navigation","summary":" Vision Based Navigation consists in utilizing cameras as precision sensors\nfor GNC after extracting information from images. To enable the adoption of\nmachine learning for space applications, one of the obstacles is the demonstration\nthat available training datasets are adequate to validate the algorithms. The\nobjective of the study is to generate datasets of images and metadata suitable\nfor training machine learning algorithms. 
Two use cases were selected and a\nrobust methodology was developed to validate the datasets including the ground\ntruth. The first use case is in-orbit rendezvous with a man-made object: a\nmockup of satellite ENVISAT. The second use case is a Lunar landing scenario.\nDatasets were produced from archival datasets (Chang'e 3), from the laboratory\nat DLR TRON facility and at Airbus Robotic laboratory, from SurRender software\nhigh fidelity image simulator using Model Capture and from Generative\nAdversarial Networks. The use case definition included the selection of\nalgorithms as benchmark: an AI-based pose estimation algorithm and a dense\noptical flow algorithm were selected. Eventually it is demonstrated that\ndatasets produced with SurRender and selected laboratory facilities are\nadequate to train machine learning algorithms.\n","authors":["Jérémy Lebreton","Ingo Ahrns","Roland Brochard","Christoph Haskamp","Matthieu Le Goff","Nicolas Menga","Nicolas Ollagnier","Ralf Regele","Francesco Capolupo","Massimo Casasco"],"pdf_url":"https://arxiv.org/pdf/2409.11383v1.pdf","comment":"6 pages, 4 figures, preprint of the proceedings of ESA SPAICE\n conference 2024"},{"id":"http://arxiv.org/abs/2408.08567v3","updated":"2024-09-17T17:30:46Z","published":"2024-08-16T07:01:46Z","title":"S$^3$Attention: Improving Long Sequence Attention with Smoothed Skeleton\n Sketching","summary":" Attention based models have achieved many remarkable breakthroughs in\nnumerous applications. However, the quadratic complexity of Attention makes the\nvanilla Attention based models hard to apply to long sequence tasks. Various\nimproved Attention structures are proposed to reduce the computation cost by\ninducing low rankness and approximating the whole sequence by sub-sequences.\nThe most challenging part of those approaches is maintaining the proper balance\nbetween information preservation and computation reduction: the longer\nsub-sequences used, the better information is preserved, but at the price of\nintroducing more noise and computational costs. In this paper, we propose a\nsmoothed skeleton sketching based Attention structure, coined S$^3$Attention,\nwhich significantly improves upon the previous attempts to negotiate this\ntrade-off. S$^3$Attention has two mechanisms to effectively minimize the impact\nof noise while keeping the linear complexity to the sequence length: a\nsmoothing block to mix information over long sequences and a matrix sketching\nmethod that simultaneously selects columns and rows from the input matrix. We\nverify the effectiveness of S$^3$Attention both theoretically and empirically.\nExtensive studies over Long Range Arena (LRA) datasets and six time-series\nforecasting show that S$^3$Attention significantly outperforms both vanilla\nAttention and other state-of-the-art variants of Attention structures.\n","authors":["Xue Wang","Tian Zhou","Jianqing Zhu","Jialin Liu","Kun Yuan","Tao Yao","Wotao Yin","Rong Jin","HanQin Cai"],"pdf_url":"https://arxiv.org/pdf/2408.08567v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11380v1","updated":"2024-09-17T17:29:33Z","published":"2024-09-17T17:29:33Z","title":"Ultrasound Image Enhancement with the Variance of Diffusion Models","summary":" Ultrasound imaging, despite its widespread use in medicine, often suffers\nfrom various sources of noise and artifacts that impact the signal-to-noise\nratio and overall image quality. Enhancing ultrasound images requires a\ndelicate balance between contrast, resolution, and speckle preservation. 
This\npaper introduces a novel approach that integrates adaptive beamforming with\ndenoising diffusion-based variance imaging to address this challenge. By\napplying Eigenspace-Based Minimum Variance (EBMV) beamforming and employing a\ndenoising diffusion model fine-tuned on ultrasound data, our method computes\nthe variance across multiple diffusion-denoised samples to produce high-quality\ndespeckled images. This approach leverages both the inherent multiplicative\nnoise of ultrasound and the stochastic nature of diffusion models. Experimental\nresults on a publicly available dataset demonstrate the effectiveness of our\nmethod in achieving superior image reconstructions from single plane-wave\nacquisitions. The code is available at:\nhttps://github.com/Yuxin-Zhang-Jasmine/IUS2024_Diffusion.\n","authors":["Yuxin Zhang","Clément Huneau","Jérôme Idier","Diana Mateus"],"pdf_url":"https://arxiv.org/pdf/2409.11380v1.pdf","comment":"Accepted by the IEEE International Ultrasonics Symposium (IUS) 2024"},{"id":"http://arxiv.org/abs/2409.11375v1","updated":"2024-09-17T17:22:35Z","published":"2024-09-17T17:22:35Z","title":"Multi-OCT-SelfNet: Integrating Self-Supervised Learning with\n Multi-Source Data Fusion for Enhanced Multi-Class Retinal Disease\n Classification","summary":" In the medical domain, acquiring large datasets poses significant challenges\ndue to privacy concerns. Nonetheless, the development of a robust deep-learning\nmodel for retinal disease diagnosis necessitates a substantial dataset for\ntraining. The capacity to generalize effectively on smaller datasets remains a\npersistent challenge. The scarcity of data presents a significant barrier to\nthe practical implementation of scalable medical AI solutions. To address this\nissue, we combined a wide range of data sources to improve performance and\ngeneralization to new data, and developed a self-supervised\nframework based on large language models (LLMs), SwinV2, to gain a deeper\nunderstanding of multi-modal dataset representations, enhancing the model's\nability to extrapolate to new data for the detection of eye diseases using\noptical coherence tomography (OCT) images. We adopt a two-phase training\nmethodology: self-supervised pre-training followed by fine-tuning on a downstream\nsupervised classifier. An ablation study conducted across three datasets\nemploying various encoder backbones, without data fusion, with a low data\navailability setting, and without self-supervised pre-training scenarios,\nhighlights the robustness of our method. Our findings demonstrate consistent\nperformance across these diverse conditions, showcasing superior generalization\ncapabilities compared to the baseline model, ResNet-50.\n","authors":["Fatema-E- Jannat","Sina Gholami","Jennifer I. Lim","Theodore Leng","Minhaj Nur Alam","Hamed Tabkhi"],"pdf_url":"https://arxiv.org/pdf/2409.11375v1.pdf","comment":"25 pages, 9 tables, 10 figures"},{"id":"http://arxiv.org/abs/2409.11373v1","updated":"2024-09-17T17:20:21Z","published":"2024-09-17T17:20:21Z","title":"Uncertainty and Prediction Quality Estimation for Semantic Segmentation\n via Graph Neural Networks","summary":" When employing deep neural networks (DNNs) for semantic segmentation in\nsafety-critical applications like automotive perception or medical imaging, it\nis important to estimate their performance at runtime, e.g. via uncertainty\nestimates or prediction quality estimates. 
Previous works mostly performed\nuncertainty estimation on pixel-level. In a line of research, a\nconnected-component-wise (segment-wise) perspective was taken, approaching\nuncertainty estimation on an object-level by performing so-called meta\nclassification and regression to estimate uncertainty and prediction quality,\nrespectively. In those works, each predicted segment is considered individually\nto estimate its uncertainty or prediction quality. However, the neighboring\nsegments may provide additional hints on whether a given predicted segment is\nof high quality, which we study in the present work. On the basis of\nuncertainty indicating metrics on segment-level, we use graph neural networks\n(GNNs) to model the relationship of a given segment's quality as a function of\nthe given segment's metrics as well as those of its neighboring segments. We\ncompare different GNN architectures and achieve a notable performance\nimprovement.\n","authors":["Edgar Heinert","Stephan Tilgner","Timo Palm","Matthias Rottmann"],"pdf_url":"https://arxiv.org/pdf/2409.11373v1.pdf","comment":"11 pages, 3 figures, submitted to BMVC \"Workshop on Robust\n Recognition in the Open World\" (https://rrow2024.github.io/call-for-papers)"},{"id":"http://arxiv.org/abs/2409.11370v1","updated":"2024-09-17T17:18:57Z","published":"2024-09-17T17:18:57Z","title":"Compact Implicit Neural Representations for Plane Wave Images","summary":" Ultrafast Plane-Wave (PW) imaging often produces artifacts and shadows that\nvary with insonification angles. We propose a novel approach using Implicit\nNeural Representations (INRs) to compactly encode multi-planar sequences while\npreserving crucial orientation-dependent information. To our knowledge, this is\nthe first application of INRs for PW angular interpolation. Our method employs\na Multi-Layer Perceptron (MLP)-based model with a concise physics-enhanced\nrendering technique. Quantitative evaluations using SSIM, PSNR, and standard\nultrasound metrics, along with qualitative visual assessments, confirm the\neffectiveness of our approach. Additionally, our method demonstrates\nsignificant storage efficiency, with model weights requiring 530 KB compared to\n8 MB for directly storing the 75 PW images, achieving a notable compression\nratio of approximately 15:1.\n","authors":["Mathilde Monvoisin","Yuxin Zhang","Diana Mateus"],"pdf_url":"https://arxiv.org/pdf/2409.11370v1.pdf","comment":"Accepted by the IEEE International Ultrasonics Symposium (IUS) 2024"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.11598v1","updated":"2024-09-17T23:10:04Z","published":"2024-09-17T23:10:04Z","title":"Towards Fair RAG: On the Impact of Fair Ranking in Retrieval-Augmented\n Generation","summary":" Many language models now enhance their responses with retrieval capabilities,\nleading to the widespread adoption of retrieval-augmented generation (RAG)\nsystems. However, despite retrieval being a core component of RAG, much of the\nresearch in this area overlooks the extensive body of work on fair ranking,\nneglecting the importance of considering all stakeholders involved. This paper\npresents the first systematic evaluation of RAG systems integrated with fair\nrankings. We focus specifically on measuring the fair exposure of each relevant\nitem across the rankings utilized by RAG systems (i.e., item-side fairness),\naiming to promote equitable growth for relevant item providers. 
To gain a deep\nunderstanding of the relationship between item-fairness, ranking quality, and\ngeneration quality in the context of RAG, we analyze nine different RAG systems\nthat incorporate fair rankings across seven distinct datasets. Our findings\nindicate that RAG systems with fair rankings can maintain a high level of\ngeneration quality and, in many cases, even outperform traditional RAG systems,\ndespite the general trend of a tradeoff between ensuring fairness and\nmaintaining system-effectiveness. We believe our insights lay the groundwork\nfor responsible and equitable RAG systems and open new avenues for future\nresearch. We publicly release our codebase and dataset at\nhttps://github.com/kimdanny/Fair-RAG.\n","authors":["To Eun Kim","Fernando Diaz"],"pdf_url":"https://arxiv.org/pdf/2409.11598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11511v1","updated":"2024-09-17T19:25:58Z","published":"2024-09-17T19:25:58Z","title":"A Framework for Ranking Content Providers Using Prompt Engineering and\n Self-Attention Network","summary":" This paper addresses the problem of ranking Content Providers for a Content\nRecommendation System. Content Providers are the sources of news and other\ntypes of content, such as lifestyle, travel, and gardening. We propose a framework\nthat leverages explicit user feedback, such as clicks and reactions, and\ncontent-based features, such as writing style and frequency of publishing, to\nrank Content Providers for a given topic. We also use language models to\nengineer prompts that help us create a ground truth dataset for the previously\nunsupervised ranking problem. Using this ground truth, we expand with a\nself-attention-based network to train on a ListWise Learning to Rank task. We\nevaluate our framework using online experiments and show that it can improve\nthe quality, credibility, and diversity of the content recommended to users.\n","authors":["Gosuddin Kamaruddin Siddiqi","Deven Santhosh Shah","Radhika Bansal","Askar Kamalov"],"pdf_url":"https://arxiv.org/pdf/2409.11511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11505v1","updated":"2024-09-17T19:17:57Z","published":"2024-09-17T19:17:57Z","title":"Perceptions of Edinburgh: Capturing Neighbourhood Characteristics by\n Clustering Geoparsed Local News","summary":" The communities that we live in affect our health in ways that are complex\nand hard to define. Moreover, our understanding of the place-based processes\naffecting health and inequalities is limited. This undermines the development\nof robust policy interventions to improve local health and well-being. News\nmedia provides social and community information that may be useful in health\nstudies. Here we propose a methodology for characterising neighbourhoods by\nusing local news articles. More specifically, we show how we can use Natural\nLanguage Processing (NLP) to unlock further information about neighbourhoods by\nanalysing, geoparsing and clustering news articles. Our work is novel because\nwe combine street-level geoparsing tailored to the locality with clustering of\nfull news articles, enabling a more detailed examination of neighbourhood\ncharacteristics. We evaluate our outputs and show via a confluence of evidence,\nboth from a qualitative and a quantitative perspective, that the themes we\nextract from news articles are sensible and reflect many characteristics of the\nreal world. This is significant because it allows us to better understand the\neffects of neighbourhoods on health. 
Our findings on neighbourhood\ncharacterisation using news data will support a new generation of place-based\nresearch which examines a wider set of spatial processes and how they affect\nhealth, enabling new epidemiological research.\n","authors":["Andreas Grivas","Claire Grover","Richard Tobin","Clare Llewellyn","Eleojo Oluwaseun Abubakar","Chunyu Zheng","Chris Dibben","Alan Marshall","Jamie Pearce","Beatrice Alex"],"pdf_url":"https://arxiv.org/pdf/2409.11505v1.pdf","comment":"Preprint - paper under submission"},{"id":"http://arxiv.org/abs/2403.18604v3","updated":"2024-09-17T16:08:05Z","published":"2024-03-27T14:24:28Z","title":"Modeling Sustainable City Trips: Integrating CO2e Emissions, Popularity,\n and Seasonality into Tourism Recommender Systems","summary":" Tourism affects not only the tourism industry but also society and\nstakeholders such as the environment, local businesses, and residents. Tourism\nRecommender Systems (TRS) can be pivotal in promoting sustainable tourism by\nguiding travelers toward destinations with minimal negative impact. Our paper\nintroduces a composite sustainability indicator for a city trip TRS based on\nthe users' starting point and month of travel. This indicator integrates CO2e\nemissions for different transportation modes and analyses destination\npopularity and seasonal demand. We quantify city popularity based on user\nreviews, points of interest, and search trends from Tripadvisor and Google\nTrends data. To calculate a seasonal demand index, we leverage data from\nTourMIS and Airbnb. We conducted a user study to explore the fundamental\ntrade-offs in travel decision-making and determine the weights for our proposed\nindicator. Finally, we demonstrate the integration of this indicator into a\nTRS, illustrating its ability to deliver sustainable city trip recommendations.\nThis work lays the foundation for future research by integrating sustainability\nmeasures and contributing to responsible recommendations by TRS.\n","authors":["Ashmi Banerjee","Tunar Mahmudov","Emil Adler","Fitri Nur Aisyah","Wolfgang Wörndl"],"pdf_url":"https://arxiv.org/pdf/2403.18604v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11281v1","updated":"2024-09-17T15:37:51Z","published":"2024-09-17T15:37:51Z","title":"Beyond Relevance: Improving User Engagement by Personalization for\n Short-Video Search","summary":" Personalized search has been extensively studied in various applications,\nincluding web search, e-commerce, social networks, etc. With the soaring\npopularity of short-video platforms, exemplified by TikTok and Kuaishou, the\nquestion arises: can personalization elevate the realm of short-video search,\nand if so, which techniques hold the key?\n In this work, we introduce $\\text{PR}^2$, a novel and comprehensive solution\nfor personalizing short-video search, where $\\text{PR}^2$ stands for the\nPersonalized Retrieval and Ranking augmented search system. Specifically,\n$\\text{PR}^2$ leverages query-relevant collaborative filtering and personalized\ndense retrieval to extract relevant and individually tailored content from a\nlarge-scale video corpus. Furthermore, it utilizes the QIN (Query-Dominate User\nInterest Network) ranking model to effectively harness users' long-term\npreferences and real-time behaviors, and efficiently learn from users' various\nimplicit feedback through a multi-task learning framework. 
By deploying\n$\\text{PR}^2$ in the production system, we have achieved the most remarkable user\nengagement improvements in recent years: a 10.2% increase in CTR@10, a notable\n20% surge in video watch time, and a 1.6% uplift of search DAU. We believe the\npractical insights presented in this work are especially valuable for building\nand improving personalized search systems for short-video platforms.\n","authors":["Wentian Bao","Hu Liu","Kai Zheng","Chao Zhang","Shunyu Zhang","Enyun Yu","Wenwu Ou","Yang Song"],"pdf_url":"https://arxiv.org/pdf/2409.11281v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11279v1","updated":"2024-09-17T15:29:34Z","published":"2024-09-17T15:29:34Z","title":"P-RAG: Progressive Retrieval Augmented Generation For Planning on\n Embodied Everyday Task","summary":" Embodied Everyday Task is a popular task in the embodied AI community,\nrequiring agents to make a sequence of actions based on natural language\ninstructions and visual observations. Traditional learning-based approaches\nface two challenges. Firstly, natural language instructions often lack explicit\ntask planning. Secondly, extensive training is required to equip models with\nknowledge of the task environment. Previous works based on Large Language Model\n(LLM) either suffer from poor performance due to the lack of task-specific\nknowledge or rely on ground truth as few-shot samples. To address the above\nlimitations, we propose a novel approach called Progressive Retrieval Augmented\nGeneration (P-RAG), which not only effectively leverages the powerful language\nprocessing capabilities of LLMs but also progressively accumulates\ntask-specific knowledge without ground-truth. Compared to the conventional RAG\nmethods, which retrieve relevant information from the database in a one-shot\nmanner to assist generation, P-RAG introduces an iterative approach to\nprogressively update the database. In each iteration, P-RAG retrieves the\nlatest database and obtains historical information from the previous\ninteraction as experiential references for the current interaction. Moreover,\nwe introduce a more granular retrieval scheme that not only retrieves\nsimilar tasks but also incorporates retrieval of similar situations to provide\nmore valuable reference experiences. Extensive experiments reveal that P-RAG\nachieves competitive results without utilizing ground truth and can even\nfurther improve performance through self-iterations.\n","authors":["Weiye Xu","Min Wang","Wengang Zhou","Houqiang Li"],"pdf_url":"https://arxiv.org/pdf/2409.11279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11449v1","updated":"2024-09-17T14:44:49Z","published":"2024-09-17T14:44:49Z","title":"Evaluation of pretrained language models on music understanding","summary":" Music-text multimodal systems have enabled new approaches to Music\nInformation Research (MIR) applications such as audio-to-text and text-to-audio\nretrieval, text-based song generation, and music captioning. Despite the\nreported success, little effort has been put into evaluating the musical\nknowledge of Large Language Models (LLM). In this paper, we demonstrate that\nLLMs suffer from 1) prompt sensitivity, 2) inability to model negation (e.g.\n'rock song without guitar'), and 3) sensitivity towards the presence of\nspecific words. We quantified these properties as a triplet-based accuracy,\nevaluating the ability to model the relative similarity of labels in a\nhierarchical ontology. 
We leveraged the Audioset ontology to generate triplets\nconsisting of an anchor, a positive (relevant) label, and a negative (less\nrelevant) label for the genre and instruments sub-tree. We evaluated the\ntriplet-based musical knowledge for six general-purpose Transformer-based\nmodels. The triplets obtained through this methodology required filtering, as\nsome were difficult to judge and therefore relatively uninformative for\nevaluation purposes. Despite the relatively high accuracy reported,\ninconsistencies are evident in all six models, suggesting that off-the-shelf\nLLMs need adaptation to music before use.\n","authors":["Yannis Vasilakis","Rachel Bittner","Johan Pauwels"],"pdf_url":"https://arxiv.org/pdf/2409.11449v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11136v1","updated":"2024-09-17T12:42:55Z","published":"2024-09-17T12:42:55Z","title":"Promptriever: Instruction-Trained Retrievers Can Be Prompted Like\n Language Models","summary":" Instruction-tuned language models (LM) are able to respond to imperative\ncommands, providing a more natural user interface compared to their base\ncounterparts. In this work, we present Promptriever, the first retrieval model\nable to be prompted like an LM. To train Promptriever, we curate and release a\nnew instance-level instruction training set from MS MARCO, spanning nearly 500k\ninstances. Promptriever not only achieves strong performance on standard\nretrieval tasks, but also follows instructions. We observe: (1) large gains\n(reaching SoTA) on following detailed relevance instructions (+14.3 p-MRR /\n+3.1 nDCG on FollowIR), (2) significantly increased robustness to lexical\nchoices/phrasing in the query+instruction (+12.9 Robustness@10 on InstructIR),\nand (3) the ability to perform hyperparameter search via prompting to reliably\nimprove retrieval performance (+1.4 average increase on BEIR). Promptriever\ndemonstrates that retrieval models can be controlled with prompts on a\nper-query basis, setting the stage for future work aligning LM prompting\ntechniques with information retrieval.\n","authors":["Orion Weller","Benjamin Van Durme","Dawn Lawrie","Ashwin Paranjape","Yuhao Zhang","Jack Hessel"],"pdf_url":"https://arxiv.org/pdf/2409.11136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10993v1","updated":"2024-09-17T08:55:50Z","published":"2024-09-17T08:55:50Z","title":"Multi-modal Generative Models in Recommendation System","summary":" Many recommendation systems limit user inputs to text strings or behavior\nsignals such as clicks and purchases, and system outputs to a list of products\nsorted by relevance. With the advent of generative AI, users have come to\nexpect richer levels of interactions. In visual search, for example, a user may\nprovide a picture of their desired product along with a natural language\nmodification of the content of the picture (e.g., a dress like the one shown in\nthe picture but in red color). Moreover, users may want to better understand\nthe recommendations they receive by visualizing how the product fits their use\ncase, e.g., with a representation of how a garment might look on them, or how a\nfurniture item might look in their room. Such advanced levels of interaction\nrequire recommendation systems that are able to discover both shared and\ncomplementary information about the product across modalities, and visualize\nthe product in a realistic and informative way. 
However, existing systems often\ntreat multiple modalities independently: text search is usually done by\ncomparing the user query to product titles and descriptions, while visual\nsearch is typically done by comparing an image provided by the customer to\nproduct images. We argue that future recommendation systems will benefit from a\nmulti-modal understanding of the products that leverages the rich information\nretailers have about both customers and products to come up with the best\nrecommendations. In this chapter we review recommendation systems that use\nmultiple data modalities simultaneously.\n","authors":["Arnau Ramisa","Rene Vidal","Yashar Deldjoo","Zhankui He","Julian McAuley","Anton Korikov","Scott Sanner","Mahesh Sathiamoorthy","Atoosa Kasrizadeh","Silvia Milano","Francesco Ricci"],"pdf_url":"https://arxiv.org/pdf/2409.10993v1.pdf","comment":"32 pages 5 figures"},{"id":"http://arxiv.org/abs/2409.10949v1","updated":"2024-09-17T07:36:21Z","published":"2024-09-17T07:36:21Z","title":"Inside Alameda Research: A Multi-Token Network Analysis","summary":" We analyze the token transfer network on Ethereum, focusing on accounts\nassociated with Alameda Research, a cryptocurrency trading firm implicated in\nthe misuse of FTX customer funds. Using a multi-token network representation,\nwe examine node centralities and the network backbone to identify critical\naccounts, tokens, and activity groups. The temporal evolution of Alameda\naccounts reveals shifts in token accumulation and distribution patterns leading\nup to its bankruptcy in November 2022. Through network analysis, our work\noffers insights into the activities and dynamics that shape the DeFi ecosystem.\n","authors":["Célestin Coquidé","Rémy Cazabet","Natkamon Tovanich"],"pdf_url":"https://arxiv.org/pdf/2409.10949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10909v1","updated":"2024-09-17T05:59:32Z","published":"2024-09-17T05:59:32Z","title":"GenCRF: Generative Clustering and Reformulation Framework for Enhanced\n Intent-Driven Information Retrieval","summary":" Query reformulation is a well-known problem in Information Retrieval (IR)\naimed at enhancing the single-search successful completion rate by automatically\nmodifying the user's input query. Recent methods leverage Large Language Models\n(LLMs) to improve query reformulation, but often generate limited and redundant\nexpansions, potentially constraining their effectiveness in capturing diverse\nintents. In this paper, we propose GenCRF: a Generative Clustering and\nReformulation Framework to capture diverse intentions adaptively based on\nmultiple differentiated, well-generated queries in the retrieval phase for the\nfirst time. GenCRF leverages LLMs to generate variable queries from the initial\nquery using customized prompts, then clusters them into groups to distinctly\nrepresent diverse intents. Furthermore, the framework explores combining\ndiverse intent queries with innovative weighted aggregation strategies to\noptimize retrieval performance and crucially integrates a novel Query\nEvaluation Rewarding Model (QERM) to refine the process through feedback loops.\nEmpirical experiments on the BEIR benchmark demonstrate that GenCRF achieves\nstate-of-the-art performance, surpassing previous query reformulation SOTAs by\nup to 12% on nDCG@10. 
These techniques can be adapted to various LLMs,\nsignificantly boosting retriever performance and advancing the field of\nInformation Retrieval.\n","authors":["Wonduk Seo","Haojie Zhang","Yueyang Zhang","Changhao Zhang","Songyao Duan","Lixin Su","Daiting Shi","Jiashu Zhao","Dawei Yin"],"pdf_url":"https://arxiv.org/pdf/2409.10909v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10907v1","updated":"2024-09-17T05:54:25Z","published":"2024-09-17T05:54:25Z","title":"Attention-Seeker: Dynamic Self-Attention Scoring for Unsupervised\n Keyphrase Extraction","summary":" This paper proposes Attention-Seeker, an unsupervised keyphrase extraction\nmethod that leverages self-attention maps from a Large Language Model to\nestimate the importance of candidate phrases. Our approach identifies specific\ncomponents - such as layers, heads, and attention vectors - where the model\npays significant attention to the key topics of the text. The attention weights\nprovided by these components are then used to score the candidate phrases.\nUnlike previous models that require manual tuning of parameters (e.g.,\nselection of heads, prompts, hyperparameters), Attention-Seeker dynamically\nadapts to the input text without any manual adjustments, enhancing its\npractical applicability. We evaluate Attention-Seeker on four publicly\navailable datasets: Inspec, SemEval2010, SemEval2017, and Krapivin. Our results\ndemonstrate that, even without parameter tuning, Attention-Seeker outperforms\nmost baseline models, achieving state-of-the-art performance on three out of\nfour datasets, particularly excelling in extracting keyphrases from long\ndocuments.\n","authors":["Erwin D. López Z.","Cheng Tang","Atsushi Shimada"],"pdf_url":"https://arxiv.org/pdf/2409.10907v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10825v1","updated":"2024-09-17T01:37:57Z","published":"2024-09-17T01:37:57Z","title":"Challenging Fairness: A Comprehensive Exploration of Bias in LLM-Based\n Recommendations","summary":" Large Language Model (LLM)-based recommendation systems provide more\ncomprehensive recommendations than traditional systems by deeply analyzing\ncontent and user behavior. However, these systems often exhibit biases,\nfavoring mainstream content while marginalizing non-traditional options due to\nskewed training data. This study investigates the intricate relationship\nbetween bias and LLM-based recommendation systems, with a focus on music, song,\nand book recommendations across diverse demographic and cultural groups.\nThrough a comprehensive analysis conducted over different LLM-models, this\npaper evaluates the impact of bias on recommendation outcomes. Our findings\nreveal that bias is so deeply ingrained within these systems that even a\nsimpler intervention like prompt engineering can significantly reduce bias,\nunderscoring the pervasive nature of the issue. 
Moreover, factors like\nintersecting identities and contextual information, such as socioeconomic\nstatus, further amplify these biases, demonstrating the complexity and depth of\nthe challenges faced in creating fair recommendations across different groups.\n","authors":["Shahnewaz Karim Sakib","Anindya Bijoy Das"],"pdf_url":"https://arxiv.org/pdf/2409.10825v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.07730v2","updated":"2024-09-17T00:48:38Z","published":"2024-09-12T03:33:19Z","title":"Music auto-tagging in the long tail: A few-shot approach","summary":" In the realm of digital music, using tags to efficiently organize and\nretrieve music from extensive databases is crucial for music catalog owners.\nHuman tagging by experts is labor-intensive but mostly accurate, whereas\nautomatic tagging through supervised learning has approached satisfying\naccuracy but is restricted to a predefined set of training tags. Few-shot\nlearning offers a viable solution to expand beyond this small set of predefined\ntags by enabling models to learn from only a few human-provided examples to\nunderstand tag meanings and subsequently apply these tags autonomously. We\npropose to integrate few-shot learning methodology into multi-label music\nauto-tagging by using features from pre-trained models as inputs to a\nlightweight linear classifier, also known as a linear probe. We investigate\ndifferent popular pre-trained features, as well as different few-shot\nparametrizations with varying numbers of classes and samples per class. Our\nexperiments demonstrate that a simple model with pre-trained features can\nachieve performance close to state-of-the-art models while using significantly\nless training data, such as 20 samples per tag. Additionally, our linear probe\nperforms competitively with leading models when trained on the entire training\ndataset. The results show that this transfer learning-based few-shot approach\ncould effectively address the issue of automatically assigning long-tail tags\nwith only limited labeled data.\n","authors":["T. Aleksandra Ma","Alexander Lerch"],"pdf_url":"https://arxiv.org/pdf/2409.07730v2.pdf","comment":"Published in Audio Engineering Society NY Show 2024 as a Peer\n Reviewed (Category 1) paper; typos corrected"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2409.11609v1","updated":"2024-09-17T23:53:34Z","published":"2024-09-17T23:53:34Z","title":"Time-Series Forecasting, Knowledge Distillation, and Refinement within a\n Multimodal PDE Foundation Model","summary":" Symbolic encoding has been used in multi-operator learning as a way to embed\nadditional information for distinct time-series data. For spatiotemporal\nsystems described by time-dependent partial differential equations, the\nequation itself provides an additional modality to identify the system. The\nutilization of symbolic expressions alongside time-series samples allows for\nthe development of multimodal predictive neural networks. A key challenge with\ncurrent approaches is that the symbolic information, i.e. the equations, must\nbe manually preprocessed (simplified, rearranged, etc.) to match and relate to\nthe existing token library, which increases costs and reduces flexibility,\nespecially when dealing with new differential equations. We propose a new token\nlibrary based on SymPy to encode differential equations as an additional\nmodality for time-series models. 
The proposed approach incurs minimal cost, is\nautomated, and maintains high prediction accuracy for forecasting tasks.\nAdditionally, we include a Bayesian filtering module that connects the\ndifferent modalities to refine the learned equation. This improves the accuracy\nof the learned symbolic representation and the predicted time-series.\n","authors":["Derek Jollie","Jingmin Sun","Zecheng Zhang","Hayden Schaeffer"],"pdf_url":"https://arxiv.org/pdf/2409.11609v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11601v1","updated":"2024-09-17T23:20:05Z","published":"2024-09-17T23:20:05Z","title":"DiffESM: Conditional Emulation of Temperature and Precipitation in Earth\n System Models with 3D Diffusion Models","summary":" Earth System Models (ESMs) are essential for understanding the interaction\nbetween human activities and the Earth's climate. However, the computational\ndemands of ESMs often limit the number of simulations that can be run,\nhindering the robust analysis of risks associated with extreme weather events.\nWhile low-cost climate emulators have emerged as an alternative to emulate ESMs\nand enable rapid analysis of future climate, many of these emulators only\nprovide output on at most a monthly frequency. This temporal resolution is\ninsufficient for analyzing events that require daily characterization, such as\nheat waves or heavy precipitation. We propose using diffusion models, a class\nof generative deep learning models, to effectively downscale ESM output from a\nmonthly to a daily frequency. Trained on a handful of ESM realizations,\nreflecting a wide range of radiative forcings, our DiffESM model takes monthly\nmean precipitation or temperature as input, and is capable of producing daily\nvalues with statistical characteristics close to ESM output. Combined with a\nlow-cost emulator providing monthly means, this approach requires only a small\nfraction of the computational resources needed to run a large ensemble. We\nevaluate model behavior using a number of extreme metrics, showing that DiffESM\nclosely matches the spatio-temporal behavior of the ESM output it emulates in\nterms of the frequency and spatial characteristics of phenomena such as heat\nwaves, dry spells, or rainfall intensity.\n","authors":["Seth Bassetti","Brian Hutchinson","Claudia Tebaldi","Ben Kravitz"],"pdf_url":"https://arxiv.org/pdf/2409.11601v1.pdf","comment":"Accepted for publication in Journal of Advances in Modeling Earth\n Systems"},{"id":"http://arxiv.org/abs/2409.11600v1","updated":"2024-09-17T23:15:39Z","published":"2024-09-17T23:15:39Z","title":"No Saved Kaleidosope: an 100% Jitted Neural Network Coding Language with\n Pythonic Syntax","summary":" We developed a jitted compiler for training Artificial Neural Networks using\nC++, LLVM and Cuda. It features object-oriented characteristics, strong typing,\nparallel workers for data pre-processing, pythonic syntax for expressions,\nPyTorch-like model declaration and Automatic Differentiation. We implement caching\nand pooling mechanisms to manage VRAM, cuBLAS for high-performance matrix\nmultiplication, and cuDNN for convolutional layers. In our\nexperiments with Residual Convolutional Neural Networks on ImageNet, we reach\nsimilar speed but degraded performance. Also, the GRU network experiments show\nsimilar accuracy, but our compiler has degraded speed in that task. However,\nour compiler demonstrates promising results on the CIFAR-10 benchmark, where\nwe reach the same performance and about the same speed as PyTorch. 
We make the\ncode publicly available at: https://github.com/NoSavedDATA/NoSavedKaleidoscope\n","authors":["Augusto Seben da Rosa","Marlon Daniel Angeli","Jorge Aikes Junior","Alef Iury Ferreira","Lucas Rafael Gris","Anderson da Silva Soares","Arnaldo Candido Junior","Frederico Santos de Oliveira","Gabriel Trevisan Damke","Rafael Teixeira Sousa"],"pdf_url":"https://arxiv.org/pdf/2409.11600v1.pdf","comment":"12 pages, 3 figures and 3 tables"},{"id":"http://arxiv.org/abs/2409.11597v1","updated":"2024-09-17T23:09:25Z","published":"2024-09-17T23:09:25Z","title":"The Sample Complexity of Smooth Boosting and the Tightness of the\n Hardcore Theorem","summary":" Smooth boosters generate distributions that do not place too much weight on\nany given example. Originally introduced for their noise-tolerant properties,\nsuch boosters have also found applications in differential privacy,\nreproducibility, and quantum learning theory. We study and settle the sample\ncomplexity of smooth boosting: we exhibit a class that can be weak learned to\n$\\gamma$-advantage over smooth distributions with $m$ samples, for which strong\nlearning over the uniform distribution requires\n$\\tilde{\\Omega}(1/\\gamma^2)\\cdot m$ samples. This matches the overhead of\nexisting smooth boosters and provides the first separation from the setting of\ndistribution-independent boosting, for which the corresponding overhead is\n$O(1/\\gamma)$.\n Our work also sheds new light on Impagliazzo's hardcore theorem from\ncomplexity theory, all known proofs of which can be cast in the framework of\nsmooth boosting. For a function $f$ that is mildly hard against size-$s$\ncircuits, the hardcore theorem provides a set of inputs on which $f$ is\nextremely hard against size-$s'$ circuits. A downside of this important result\nis the loss in circuit size, i.e. that $s' \\ll s$. Answering a question of\nTrevisan, we show that this size loss is necessary and in fact, the parameters\nachieved by known proofs are the best possible.\n","authors":["Guy Blanc","Alexandre Hayderi","Caleb Koch","Li-Yang Tan"],"pdf_url":"https://arxiv.org/pdf/2409.11597v1.pdf","comment":"46 pages, FOCS 2024"},{"id":"http://arxiv.org/abs/2409.11596v1","updated":"2024-09-17T23:08:53Z","published":"2024-09-17T23:08:53Z","title":"Outlier Detection with Cluster Catch Digraphs","summary":" This paper introduces a novel family of outlier detection algorithms based on\nCluster Catch Digraphs (CCDs), specifically tailored to address the challenges\nof high dimensionality and varying cluster shapes, which deteriorate the\nperformance of most traditional outlier detection methods. We propose the\nUniformity-Based CCD with Mutual Catch Graph (U-MCCD), the Uniformity- and\nNeighbor-Based CCD with Mutual Catch Graph (UN-MCCD), and their shape-adaptive\nvariants (SU-MCCD and SUN-MCCD), which are designed to detect outliers in data\nsets with arbitrary cluster shapes and high dimensions. We present the\nadvantages and shortcomings of these algorithms and provide the motivation or\nneed to define each particular algorithm. Through comprehensive Monte Carlo\nsimulations, we assess their performance and demonstrate the robustness and\neffectiveness of our algorithms across various settings and contamination\nlevels. We also illustrate the use of our algorithms on various real-life data\nsets. The U-MCCD algorithm efficiently identifies outliers while maintaining\nhigh true negative rates, and the SU-MCCD algorithm shows substantial\nimprovement in handling non-uniform clusters. 
Additionally, the UN-MCCD and\nSUN-MCCD algorithms address the limitations of existing methods in\nhigh-dimensional spaces by utilizing Nearest Neighbor Distances (NND) for\nclustering and outlier detection. Our results indicate that these novel\nalgorithms offer substantial advancements in the accuracy and adaptability of\noutlier detection, providing a valuable tool for various real-world\napplications.\n Keyword: Outlier detection, Graph-based clustering, Cluster catch digraphs,\n$k$-nearest-neighborhood, Mutual catch graphs, Nearest neighbor distance.\n","authors":["Rui Shi","Nedret Billor","Elvan Ceyhan"],"pdf_url":"https://arxiv.org/pdf/2409.11596v1.pdf","comment":"73 pages, 146 figures"},{"id":"http://arxiv.org/abs/2409.11593v1","updated":"2024-09-17T22:58:20Z","published":"2024-09-17T22:58:20Z","title":"Self-Contrastive Forward-Forward Algorithm","summary":" The Forward-Forward (FF) algorithm is a recent, purely forward-mode learning\nmethod, that updates weights locally and layer-wise and supports supervised as\nwell as unsupervised learning. These features make it ideal for applications\nsuch as brain-inspired learning, low-power hardware neural networks, and\ndistributed learning in large models. However, while FF has shown promise on\nwritten digit recognition tasks, its performance on natural images and\ntime-series remains a challenge. A key limitation is the need to generate\nhigh-quality negative examples for contrastive learning, especially in\nunsupervised tasks, where versatile solutions are currently lacking. To address\nthis, we introduce the Self-Contrastive Forward-Forward (SCFF) method, inspired\nby self-supervised contrastive learning. SCFF generates positive and negative\nexamples applicable across different datasets, surpassing existing local\nforward algorithms for unsupervised classification accuracy on MNIST (MLP:\n98.7%), CIFAR-10 (CNN: 80.75%), and STL-10 (CNN: 77.3%). Additionally, SCFF is\nthe first to enable FF training of recurrent neural networks, opening the door\nto more complex tasks and continuous-time video and text processing.\n","authors":["Xing Chen","Dongshu Liu","Jeremie Laydevant","Julie Grollier"],"pdf_url":"https://arxiv.org/pdf/2409.11593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08717v2","updated":"2024-09-17T22:56:30Z","published":"2023-10-12T20:54:34Z","title":"Designing Observables for Measurements with Deep Learning","summary":" Many analyses in particle and nuclear physics use simulations to infer\nfundamental, effective, or phenomenological parameters of the underlying\nphysics models. When the inference is performed with unfolded cross sections,\nthe observables are designed using physics intuition and heuristics. We propose\nto design targeted observables with machine learning. Unfolded, differential\ncross sections in a neural network output contain the most information about\nparameters of interest and can be well-measured by construction. The networks\nare trained using a custom loss function that rewards outputs that are\nsensitive to the parameter(s) of interest while simultaneously penalizing\noutputs that are different between particle-level and detector-level (to\nminimize detector distortions). We demonstrate this idea in simulation using\ntwo physics models for inclusive measurements in deep inelastic scattering. 
We\nfind that the new approach is more sensitive than classical observables at\ndistinguishing the two models and also has a reduced unfolding uncertainty due\nto the reduced detector distortions.\n","authors":["Owen Long","Benjamin Nachman"],"pdf_url":"https://arxiv.org/pdf/2310.08717v2.pdf","comment":"This is the version published in EPJC"},{"id":"http://arxiv.org/abs/2409.02257v2","updated":"2024-09-17T22:26:51Z","published":"2024-09-03T19:31:03Z","title":"MMLU-Pro+: Evaluating Higher-Order Reasoning and Shortcut Learning in\n LLMs","summary":" Existing benchmarks for large language models (LLMs) increasingly struggle to\ndifferentiate between top-performing models, underscoring the need for more\nchallenging evaluation frameworks. We introduce MMLU-Pro+, an enhanced\nbenchmark building upon MMLU-Pro to assess shortcut learning and higher-order\nreasoning in LLMs. By incorporating questions with multiple correct answers\nacross diverse domains, MMLU-Pro+ tests LLMs' ability to engage in complex\nreasoning and resist simplistic problem-solving strategies. Our results show\nthat MMLU-Pro+ maintains MMLU-Pro's difficulty while providing a more rigorous\ntest of model discrimination, particularly in multi-correct answer scenarios.\nWe introduce novel metrics like shortcut selection ratio and correct pair\nidentification ratio, offering deeper insights into model behavior and\nanchoring bias. Evaluations of six state-of-the-art LLMs reveal significant\nperformance gaps, highlighting variations in reasoning abilities and bias\nsusceptibility. We release the dataset and evaluation codes at\n\\url{https://github.com/asgsaeid/mmlu-pro-plus}.\n","authors":["Saeid Asgari Taghanaki","Aliasgahr Khani","Amir Khasahmadi"],"pdf_url":"https://arxiv.org/pdf/2409.02257v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11585v1","updated":"2024-09-17T22:20:26Z","published":"2024-09-17T22:20:26Z","title":"Advances in APPFL: A Comprehensive and Extensible Federated Learning\n Framework","summary":" Federated learning (FL) is a distributed machine learning paradigm enabling\ncollaborative model training while preserving data privacy. In today's\nlandscape, where most data is proprietary, confidential, and distributed, FL\nhas become a promising approach to leverage such data effectively, particularly\nin sensitive domains such as medicine and the electric grid. Heterogeneity and\nsecurity are the key challenges in FL; however, most existing FL frameworks\neither fail to address these challenges adequately or lack the flexibility to\nincorporate new solutions. To this end, we present the recent advances in\ndeveloping APPFL, an extensible framework and benchmarking suite for federated\nlearning, which offers comprehensive solutions for heterogeneity and security\nconcerns, as well as user-friendly interfaces for integrating new algorithms or\nadapting to new applications. We demonstrate the capabilities of APPFL through\nextensive experiments evaluating various aspects of FL, including communication\nefficiency, privacy preservation, computational performance, and resource\nutilization. We further highlight the extensibility of APPFL through case\nstudies in vertical, hierarchical, and decentralized FL. 
APPFL is open-sourced\nat https://github.com/APPFL/APPFL.\n","authors":["Zilinghan Li","Shilan He","Ze Yang","Minseok Ryu","Kibaek Kim","Ravi Madduri"],"pdf_url":"https://arxiv.org/pdf/2409.11585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00809v5","updated":"2024-09-17T22:18:09Z","published":"2023-06-01T15:37:32Z","title":"Initial Guessing Bias: How Untrained Networks Favor Some Classes","summary":" Understanding and controlling biasing effects in neural networks is crucial\nfor ensuring accurate and fair model performance. In the context of\nclassification problems, we provide a theoretical analysis demonstrating that\nthe structure of a deep neural network (DNN) can condition the model to assign\nall predictions to the same class, even before the beginning of training, and\nin the absence of explicit biases. We prove that, besides dataset properties,\nthe presence of this phenomenon, which we call \\textit{Initial Guessing Bias}\n(IGB), is influenced by model choices including dataset preprocessing methods,\nand architectural decisions, such as activation functions, max-pooling layers,\nand network depth. Our analysis of IGB provides information for architecture\nselection and model initialization. We also highlight theoretical consequences,\nsuch as the breakdown of node-permutation symmetry, the violation of\nself-averaging and the non-trivial effects that depth has on the phenomenon.\n","authors":["Emanuele Francazi","Aurelien Lucchi","Marco Baity-Jesi"],"pdf_url":"https://arxiv.org/pdf/2306.00809v5.pdf","comment":"Updated the notation to enhance clarity"},{"id":"http://arxiv.org/abs/2406.08591v3","updated":"2024-09-17T22:02:41Z","published":"2024-06-12T18:54:22Z","title":"MEMO-QCD: Quantum Density Estimation through Memetic Optimisation for\n Quantum Circuit Design","summary":" This paper presents a strategy for efficient quantum circuit design for\ndensity estimation. The strategy is based on a quantum-inspired algorithm for\ndensity estimation and a circuit optimisation routine based on memetic\nalgorithms. The model maps a training dataset to a quantum state represented by\na density matrix through a quantum feature map. This training state encodes the\nprobability distribution of the dataset in a quantum state, such that the\ndensity of a new sample can be estimated by projecting its corresponding\nquantum state onto the training state. We propose the application of a memetic\nalgorithm to find the architecture and parameters of a variational quantum\ncircuit that implements the quantum feature map, along with a variational\nlearning strategy to prepare the training state. Demonstrations of the proposed\nstrategy show an accurate approximation of the Gaussian kernel density\nestimation method through shallow quantum circuits illustrating the feasibility\nof the algorithm for near-term quantum hardware.\n","authors":["Juan E. Ardila-García","Vladimir Vargas-Calderón","Fabio A. González","Diego H. 
Useche","Herbert Vinck-Posada"],"pdf_url":"https://arxiv.org/pdf/2406.08591v3.pdf","comment":"15 pages, 8 figures, presented at QTML 2023"},{"id":"http://arxiv.org/abs/2409.11576v1","updated":"2024-09-17T22:01:56Z","published":"2024-09-17T22:01:56Z","title":"Automating proton PBS treatment planning for head and neck cancers using\n policy gradient-based deep reinforcement learning","summary":" Proton pencil beam scanning (PBS) treatment planning for head and neck (H&N)\ncancers is a time-consuming and experience-demanding task where a large number\nof planning objectives are involved. Deep reinforcement learning (DRL) has\nrecently been introduced to the planning processes of intensity-modulated\nradiation therapy and brachytherapy for prostate, lung, and cervical cancers.\nHowever, existing approaches are built upon the Q-learning framework and\nweighted linear combinations of clinical metrics, suffering from poor\nscalability and flexibility and only capable of adjusting a limited number of\nplanning objectives in discrete action spaces. We propose an automatic\ntreatment planning model using the proximal policy optimization (PPO) algorithm\nand a dose distribution-based reward function for proton PBS treatment planning\nof H&N cancers. Specifically, a set of empirical rules is used to create\nauxiliary planning structures from target volumes and organs-at-risk (OARs),\nalong with their associated planning objectives. These planning objectives are\nfed into an in-house optimization engine to generate the spot monitor unit (MU)\nvalues. A decision-making policy network trained using PPO is developed to\niteratively adjust the involved planning objective parameters in a continuous\naction space and refine the PBS treatment plans using a novel dose\ndistribution-based reward function. Proton H&N treatment plans generated by the\nmodel show improved OAR sparing with equal or superior target coverage when\ncompared with human-generated plans. Moreover, additional experiments on liver\ncancer demonstrate that the proposed method can be successfully generalized to\nother treatment sites. To the best of our knowledge, this is the first\nDRL-based automatic treatment planning model capable of achieving human-level\nperformance for H&N cancers.\n","authors":["Qingqing Wang","Chang Chang"],"pdf_url":"https://arxiv.org/pdf/2409.11576v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.08609v2","updated":"2024-09-17T21:32:14Z","published":"2022-05-17T19:55:56Z","title":"Bagged Polynomial Regression and Neural Networks","summary":" Series and polynomial regression are able to approximate the same function\nclasses as neural networks. However, these methods are rarely used in practice,\nalthough they offer more interpretability than neural networks. In this paper,\nwe show that a potential reason for this is the slow convergence rate of\npolynomial regression estimators and propose the use of \\textit{bagged}\npolynomial regression (BPR) as an attractive alternative to neural networks.\nTheoretically, we derive new finite sample and asymptotic $L^2$ convergence\nrates for series estimators. We show that the rates can be improved in smooth\nsettings by splitting the feature space and generating polynomial features\nseparately for each partition. Empirically, we show that our proposed\nestimator, the BPR, can perform as well as more complex models with more\nparameters. Our estimator also performs close to state-of-the-art prediction\nmethods in the benchmark MNIST handwritten digit dataset. 
We demonstrate that\nBPR performs as well as neural networks in crop classification using satellite\ndata, a setting where prediction accuracy is critical and interpretability is\noften required for addressing research questions.\n","authors":["Sylvia Klosin","Jaume Vives-i-Bastida"],"pdf_url":"https://arxiv.org/pdf/2205.08609v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2401.06344v2","updated":"2024-09-17T21:31:09Z","published":"2024-01-12T03:26:06Z","title":"Hyper-STTN: Social Group-aware Spatial-Temporal Transformer Network for\n Human Trajectory Prediction with Hypergraph Reasoning","summary":" Predicting crowded intents and trajectories is crucial in various real-world\napplications, including service robots and autonomous vehicles. Understanding\nenvironmental dynamics is challenging, not only due to the complexities of\nmodeling pair-wise spatial and temporal interactions but also the diverse\ninfluence of group-wise interactions. To decode the comprehensive pair-wise and\ngroup-wise interactions in crowded scenarios, we introduce Hyper-STTN, a\nHypergraph-based Spatial-Temporal Transformer Network for crowd trajectory\nprediction. In Hyper-STTN, crowded group-wise correlations are constructed\nusing a set of multi-scale hypergraphs with varying group sizes, captured\nthrough random-walk probability-based hypergraph spectral convolution.\nAdditionally, a spatial-temporal transformer is adapted to capture pedestrians'\npair-wise latent interactions in spatial-temporal dimensions. These\nheterogeneous group-wise and pair-wise interactions are then fused and aligned\nthrough a multimodal transformer network. Hyper-STTN outperforms other state-of-the-art\nbaselines and ablation models on 5 real-world pedestrian motion datasets.\n","authors":["Weizheng Wang","Chaowei Wang","Baijian Yang","Guohua Chen","Byung-Cheol Min"],"pdf_url":"https://arxiv.org/pdf/2401.06344v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11564v1","updated":"2024-09-17T21:28:51Z","published":"2024-09-17T21:28:51Z","title":"Preference Tuning with Human Feedback on Language, Speech, and Vision\n Tasks: A Survey","summary":" Preference tuning is a crucial process for aligning deep generative models\nwith human preferences. This survey offers a thorough overview of recent\nadvancements in preference tuning and the integration of human feedback. The\npaper is organized into three main sections: 1) introduction and preliminaries:\nan introduction to reinforcement learning frameworks, preference tuning tasks,\nmodels, and datasets across various modalities: language, speech, and vision,\nas well as different policy approaches, 2) in-depth examination of each\npreference tuning approach: a detailed analysis of the methods used in\npreference tuning, and 3) applications, discussion, and future directions: an\nexploration of the applications of preference tuning in downstream tasks,\nincluding evaluation methods for different modalities, and an outlook on future\nresearch directions. Our objective is to present the latest methodologies in\npreference tuning and model alignment, enhancing the understanding of this\nfield for researchers and practitioners. We hope to encourage further\nengagement and innovation in this area.\n","authors":["Genta Indra Winata","Hanyang Zhao","Anirban Das","Wenpin Tang","David D. 
Yao","Shi-Xiong Zhang","Sambit Sahu"],"pdf_url":"https://arxiv.org/pdf/2409.11564v1.pdf","comment":"Survey paper"},{"id":"http://arxiv.org/abs/2409.11560v1","updated":"2024-09-17T21:17:59Z","published":"2024-09-17T21:17:59Z","title":"Discrete Unit based Masking for Improving Disentanglement in Voice\n Conversion","summary":" Voice conversion (VC) aims to modify the speaker's identity while preserving\nthe linguistic content. Commonly, VC methods use an encoder-decoder\narchitecture, where disentangling the speaker's identity from linguistic\ninformation is crucial. However, the disentanglement approaches used in these\nmethods are limited as the speaker features depend on the phonetic content of\nthe utterance, compromising disentanglement. This dependency is amplified with\nattention-based methods. To address this, we introduce a novel masking\nmechanism in the input before speaker encoding, masking certain discrete speech\nunits that correspond highly with phoneme classes. Our work aims to reduce the\nphonetic dependency of speaker features by restricting access to some phonetic\ninformation. Furthermore, since our approach is at the input level, it is\napplicable to any encoder-decoder based VC framework. Our approach improves\ndisentanglement and conversion performance across multiple VC methods, showing\nsignificant effectiveness, particularly in the attention-based method, with a 44%\nrelative improvement in objective intelligibility.\n","authors":["Philip H. Lee","Ismail Rasim Ulgen","Berrak Sisman"],"pdf_url":"https://arxiv.org/pdf/2409.11560v1.pdf","comment":"Accepted to IEEE SLT 2024"},{"id":"http://arxiv.org/abs/2407.13862v2","updated":"2024-09-17T21:17:54Z","published":"2024-07-18T19:15:52Z","title":"Enhancing Worldwide Image Geolocation by Ensembling Satellite-Based\n Ground-Level Attribute Predictors","summary":" We examine the challenge of estimating the location of a single ground-level\nimage in the absence of GPS or other location metadata. Currently, geolocation\nsystems are evaluated by measuring the Great Circle Distance between the\npredicted location and ground truth. Because this measurement only uses a\nsingle point, it cannot assess the distribution of predictions by geolocation\nsystems. Evaluation of a distribution of potential locations (areas) is\nrequired when there are follow-on procedures to further narrow down or verify\nthe location. This is especially important in poorly-sampled regions, e.g., rural\nand wilderness areas.\n In this paper, we introduce a novel metric, Recall vs Area (RvA), which\nmeasures the accuracy of estimated distributions of locations. RvA treats image\ngeolocation results similarly to document retrieval, measuring recall as a\nfunction of area: For a ranked list of (possibly discontiguous) predicted\nregions, we measure the area required for accumulated regions to contain the\nground truth coordinate. This produces a curve similar to a precision-recall\ncurve, where \"precision\" is replaced by area in square kilometers, enabling\nevaluation for different downstream search area budgets.\n Following from this view of the problem, we then examine an ensembling\napproach to global-scale image geolocation, which incorporates information from\nmultiple sources, and can readily incorporate multiple models, attribute\npredictors, and data sources. 
We study its effectiveness by combining the\ngeolocation models GeoEstimation and the current state-of-the-art, GeoCLIP,\nwith attribute predictors based on Oak Ridge National Laboratory LandScan and\nEuropean Space Agency Climate Change Initiative Land Cover. We find significant\nimprovements in image geolocation for areas that are under-represented in the\ntraining set, particularly non-urban areas, on both Im2GPS3k and Street View\nimages.\n","authors":["Michael J. Bianco","David Eigen","Michael Gormish"],"pdf_url":"https://arxiv.org/pdf/2407.13862v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11554v1","updated":"2024-09-17T20:52:50Z","published":"2024-09-17T20:52:50Z","title":"A Property Encoder for Graph Neural Networks","summary":" Graph machine learning, particularly using graph neural networks,\nfundamentally relies on node features. Nevertheless, numerous real-world\nsystems, such as social and biological networks, often lack node features due\nto various reasons, including privacy concerns, incomplete or missing data, and\nlimitations in data collection. In such scenarios, researchers typically resort\nto methods like structural and positional encoding to construct node features.\nHowever, the length of such features is contingent on the maximum value within\nthe property being encoded, for example, the highest node degree, which can be\nexceedingly large in applications like scale-free networks. Furthermore, these\nencoding schemes are limited to categorical data and might not be able to\nencode metrics returning other types of values. In this paper, we introduce a\nnovel, universally applicable encoder, termed PropEnc, which constructs\nexpressive node embeddings from any given graph metric. PropEnc leverages\nhistogram construction combined with reverse index encoding, offering a\nflexible method for node feature initialization. It supports flexible encoding\nin terms of both dimensionality and type of input, demonstrating its\neffectiveness across diverse applications. PropEnc allows encoding metrics in a\nlow-dimensional space, which effectively avoids the issue of sparsity and\nenhances the efficiency of the models. We show that \\emph{PropEnc} can\nconstruct node features that either exactly replicate one-hot encoding or\nclosely approximate indices under various settings. Our extensive evaluations\nin a graph classification setting across multiple social networks that lack node\nfeatures support our hypothesis. The empirical results conclusively demonstrate\nthat PropEnc is both an efficient and effective mechanism for constructing node\nfeatures from a diverse set of graph metrics.\n","authors":["Anwar Said","Xenofon Koutsoukos"],"pdf_url":"https://arxiv.org/pdf/2409.11554v1.pdf","comment":"conference paper"},{"id":"http://arxiv.org/abs/2409.11542v1","updated":"2024-09-17T20:30:35Z","published":"2024-09-17T20:30:35Z","title":"VALO: A Versatile Anytime Framework for LiDAR-based Object Detection\n Deep Neural Networks","summary":" This work addresses the challenge of adapting dynamic deadline requirements\nfor LiDAR object detection deep neural networks (DNNs). The computing latency\nof object detection is critically important to ensure safe and efficient\nnavigation. However, state-of-the-art LiDAR object detection DNNs often exhibit\nsignificant latency, hindering their real-time performance on\nresource-constrained edge platforms. 
Therefore, a tradeoff between detection\naccuracy and latency should be dynamically managed at runtime to achieve\noptimum results.\n In this paper, we introduce VALO (Versatile Anytime algorithm for LiDAR\nObject detection), a novel data-centric approach that enables anytime computing\nof 3D LiDAR object detection DNNs. VALO employs a deadline-aware scheduler to\nselectively process input regions, making execution time and accuracy tradeoffs\nwithout architectural modifications. Additionally, it leverages efficient\nforecasting of past detection results to mitigate possible loss of accuracy due\nto partial processing of input. Finally, it utilizes a novel input reduction\ntechnique within its detection heads to significantly accelerate execution\nwithout sacrificing accuracy.\n We implement VALO on state-of-the-art 3D LiDAR object detection networks,\nnamely CenterPoint and VoxelNext, and demonstrate its dynamic adaptability to a\nwide range of time constraints while achieving higher accuracy than the prior\nstate-of-the-art. Code is available at\nhttps://github.com/CSL-KU/VALO.\n","authors":["Ahmet Soyyigit","Shuochao Yao","Heechul Yun"],"pdf_url":"https://arxiv.org/pdf/2409.11542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11535v1","updated":"2024-09-17T20:13:32Z","published":"2024-09-17T20:13:32Z","title":"Balancing Optimality and Diversity: Human-Centered Decision Making\n through Generative Curation","summary":" The surge in data availability has inundated decision-makers with an\noverwhelming array of choices. While existing approaches focus on optimizing\ndecisions based on quantifiable metrics, practical decision-making often\nrequires balancing measurable quantitative criteria with unmeasurable\nqualitative factors embedded in the broader context. In such cases, algorithms\ncan generate high-quality recommendations, but the final decision rests with\nthe human, who must weigh both dimensions. We define the process of selecting\nthe optimal set of algorithmic recommendations in this context as\nhuman-centered decision making. To address this challenge, we introduce a novel\nframework called generative curation, which optimizes the true desirability of\ndecision options by integrating both quantitative and qualitative aspects. Our\nframework uses a Gaussian process to model unknown qualitative factors and\nderives a diversity metric that balances quantitative optimality with\nqualitative diversity. This trade-off enables the generation of a manageable\nsubset of diverse, near-optimal actions that are robust to unknown qualitative\npreferences. To operationalize this framework, we propose two implementation\napproaches: a generative neural network architecture that produces a\ndistribution $\\pi$ to efficiently sample a diverse set of near-optimal actions,\nand a sequential optimization method that iteratively generates solutions that\ncan be easily incorporated into complex optimization formulations. 
We validate\nour approach with extensive datasets, demonstrating its effectiveness in\nenhancing decision-making processes across a range of complex environments,\nwith significant implications for policy and management.\n","authors":["Michael Lingzhi Li","Shixiang Zhu"],"pdf_url":"https://arxiv.org/pdf/2409.11535v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2409.11402v1","updated":"2024-09-17T17:59:06Z","published":"2024-09-17T17:59:06Z","title":"NVLM: Open Frontier-Class Multimodal LLMs","summary":" We introduce NVLM 1.0, a family of frontier-class multimodal large language\nmodels (LLMs) that achieve state-of-the-art results on vision-language tasks,\nrivaling the leading proprietary models (e.g., GPT-4o) and open-access models\n(e.g., Llama 3-V 405B and InternVL 2). Remarkably, NVLM 1.0 shows improved\ntext-only performance over its LLM backbone after multimodal training. In terms\nof model design, we perform a comprehensive comparison between decoder-only\nmultimodal LLMs (e.g., LLaVA) and cross-attention-based models (e.g.,\nFlamingo). Based on the strengths and weaknesses of both approaches, we propose\na novel architecture that enhances both training efficiency and multimodal\nreasoning capabilities. Furthermore, we introduce a 1-D tile-tagging design for\ntile-based dynamic high-resolution images, which significantly boosts\nperformance on multimodal reasoning and OCR-related tasks. Regarding training\ndata, we meticulously curate and provide detailed information on our multimodal\npretraining and supervised fine-tuning datasets. Our findings indicate that\ndataset quality and task diversity are more important than scale, even during\nthe pretraining phase, across all architectures. Notably, we develop\nproduction-grade multimodality for the NVLM-1.0 models, enabling them to excel\nin vision-language tasks while maintaining and even improving text-only\nperformance compared to their LLM backbones. To achieve this, we craft and\nintegrate a high-quality text-only dataset into multimodal training, alongside\na substantial amount of multimodal math and reasoning data, leading to enhanced\nmath and coding capabilities across modalities. To advance research in the\nfield, we are releasing the model weights and will open-source the code for the\ncommunity: https://nvlm-project.github.io/.\n","authors":["Wenliang Dai","Nayeon Lee","Boxin Wang","Zhuoling Yang","Zihan Liu","Jon Barker","Tuomas Rintamaki","Mohammad Shoeybi","Bryan Catanzaro","Wei Ping"],"pdf_url":"https://arxiv.org/pdf/2409.11402v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11286v1","updated":"2024-09-17T15:41:04Z","published":"2024-09-17T15:41:04Z","title":"Enhancing Few-Shot Classification without Forgetting through Multi-Level\n Contrastive Constraints","summary":" Most recent few-shot learning approaches are based on meta-learning with\nepisodic training. However, prior studies encounter two crucial problems: (1)\n\\textit{the presence of inductive bias}, and (2) \\textit{the occurrence of\ncatastrophic forgetting}. In this paper, we propose a novel Multi-Level\nContrastive Constraints (MLCC) framework, that jointly integrates\nwithin-episode learning and across-episode learning into a unified interactive\nlearning paradigm to solve these issues. 
Specifically, we employ a space-aware\ninteraction modeling scheme to explore the correct inductive paradigms for each\nclass between within-episode similarity/dis-similarity distributions.\nAdditionally, with the aim of better utilizing former prior knowledge, a\ncross-stage distribution adaptation strategy is designed to align the\nacross-episode distributions from different time stages, thus reducing the\nsemantic gap between existing and past prediction distributions. Extensive\nexperiments on multiple few-shot datasets demonstrate the consistent\nsuperiority of the MLCC approach over existing state-of-the-art baselines.\n","authors":["Bingzhi Chen","Haoming Zhou","Yishu Liu","Biqing Zeng","Jiahui Pan","Guangming Lu"],"pdf_url":"https://arxiv.org/pdf/2409.11286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10994v1","updated":"2024-09-17T08:56:27Z","published":"2024-09-17T08:56:27Z","title":"Less is More: A Simple yet Effective Token Reduction Method for\n Efficient Multi-modal LLMs","summary":" The rapid advancement of Multimodal Large Language Models (MLLMs) has led to\nremarkable performances across various domains. However, this progress is\naccompanied by a substantial surge in the resource consumption of these models.\nWe address this pressing issue by introducing a new approach, Token Reduction\nusing CLIP Metric (TRIM), aimed at improving the efficiency of MLLMs without\nsacrificing their performance. Inspired by human attention patterns in Visual\nQuestion Answering (VQA) tasks, TRIM presents a fresh perspective on the\nselection and reduction of image tokens. The TRIM method has been extensively\ntested across 12 datasets, and the results demonstrate a significant reduction\nin computational overhead while maintaining a consistent level of performance.\nThis research marks a critical stride in efficient MLLM development, promoting\ngreater accessibility and sustainability of high-performing models.\n","authors":["Dingjie Song","Wenjun Wang","Shunian Chen","Xidong Wang","Michael Guan","Benyou Wang"],"pdf_url":"https://arxiv.org/pdf/2409.10994v1.pdf","comment":"9 pages, 3 figures, 6 tables"},{"id":"http://arxiv.org/abs/2409.10958v1","updated":"2024-09-17T07:52:09Z","published":"2024-09-17T07:52:09Z","title":"Towards Effective User Attribution for Latent Diffusion Models via\n Watermark-Informed Blending","summary":" Rapid advancements in multimodal large language models have enabled the\ncreation of hyper-realistic images from textual descriptions. However, these\nadvancements also raise significant concerns about unauthorized use, which\nhinders their broader distribution. Traditional watermarking methods often\nrequire complex integration or degrade image quality. To address these\nchallenges, we introduce a novel framework, Towards Effective user Attribution\nfor latent diffusion models via Watermark-Informed Blending (TEAWIB). TEAWIB\nincorporates a unique ready-to-use configuration approach that allows seamless\nintegration of user-specific watermarks into generative models. This approach\nensures that each user can directly apply a pre-configured set of parameters to\nthe model without altering the original model parameters or compromising image\nquality. Additionally, noise and augmentation operations are embedded at the\npixel level to further secure and stabilize watermarked images. 
Extensive\nexperiments validate the effectiveness of TEAWIB, showcasing\nstate-of-the-art performance in perceptual quality and attribution accuracy.\n","authors":["Yongyang Pan","Xiaohong Liu","Siqi Luo","Yi Xin","Xiao Guo","Xiaoming Liu","Xiongkuo Min","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2409.10958v1.pdf","comment":"9 pages, 7 figures"},{"id":"http://arxiv.org/abs/2409.10848v1","updated":"2024-09-17T02:30:34Z","published":"2024-09-17T02:30:34Z","title":"3DFacePolicy: Speech-Driven 3D Facial Animation with Diffusion Policy","summary":" Audio-driven 3D facial animation has made impressive progress both in research\nand application development. The newest approaches focus on Transformer-based\nmethods and diffusion-based methods; however, there is still a gap in\nvividness and emotional expression between the generated animation and a real\nhuman face. To tackle this limitation, we propose 3DFacePolicy, a diffusion\npolicy model for 3D facial animation prediction. This method generates variable\nand realistic human facial movements by predicting the 3D vertex trajectory on\nthe 3D facial template with diffusion policy instead of facial generation for\nevery frame. It takes audio and vertex states as observations to predict the\nvertex trajectory and imitate real human facial expressions, which keeps the\ncontinuous and natural flow of human emotions. The experiments show that our\napproach is effective in synthesizing variable and dynamic facial motion.\n","authors":["Xuanmeng Sha","Liyun Zhang","Tomohiro Mashita","Yuki Uranishi"],"pdf_url":"https://arxiv.org/pdf/2409.10848v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10831v1","updated":"2024-09-17T01:48:42Z","published":"2024-09-17T01:48:42Z","title":"PDMX: A Large-Scale Public Domain MusicXML Dataset for Symbolic Music\n Processing","summary":" The recent explosion of generative AI-Music systems has raised numerous\nconcerns over data copyright, licensing music from musicians, and the conflict\nbetween open-source AI and large prestige companies. Such issues highlight the\nneed for publicly available, copyright-free musical data, in which there is a\nlarge shortage, particularly for symbolic music data. To alleviate this issue,\nwe present PDMX: a large-scale open-source dataset of over 250K public domain\nMusicXML scores collected from the score-sharing forum MuseScore, making it the\nlargest available copyright-free symbolic music dataset to our knowledge. PDMX\nadditionally includes a wealth of both tag and user interaction metadata,\nallowing us to efficiently analyze the dataset and filter for high quality\nuser-generated scores. Given the additional metadata afforded by our data\ncollection process, we conduct multitrack music generation experiments\nevaluating how different representative subsets of PDMX lead to different\nbehaviors in downstream models, and how user-rating statistics can be used as\nan effective measure of data quality. 
Examples can be found at\nhttps://pnlong.github.io/PDMX.demo/.\n","authors":["Phillip Long","Zachary Novack","Taylor Berg-Kirkpatrick","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2409.10831v1.pdf","comment":null}]},"2024-09-22T00:00:00Z":{"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2409.12014v2","updated":"2024-09-22T14:57:35Z","published":"2024-09-18T14:28:52Z","title":"BRDF-NeRF: Neural Radiance Fields with Optical Satellite Images and BRDF\n Modelling","summary":" Understanding the anisotropic reflectance of complex Earth surfaces from\nsatellite imagery is crucial for numerous applications. Neural radiance fields\n(NeRF) have become popular as a machine learning technique capable of deducing\nthe bidirectional reflectance distribution function (BRDF) of a scene from\nmultiple images. However, prior research has largely concentrated on applying\nNeRF to close-range imagery, estimating basic Microfacet BRDF models, which\nfall short for many Earth surfaces. Moreover, high-quality NeRFs generally\nrequire several images captured simultaneously, a rare occurrence in satellite\nimaging. To address these limitations, we propose BRDF-NeRF, developed to\nexplicitly estimate the Rahman-Pinty-Verstraete (RPV) model, a semi-empirical\nBRDF model commonly employed in remote sensing. We assess our approach using\ntwo datasets: (1) Djibouti, captured in a single epoch at varying viewing\nangles with a fixed Sun position, and (2) Lanzhou, captured over multiple\nepochs with different viewing angles and Sun positions. Our results, based on\nonly three to four satellite images for training, demonstrate that BRDF-NeRF\ncan effectively synthesize novel views from directions far removed from the\ntraining data and produce high-quality digital surface models (DSMs).\n","authors":["Lulin Zhang","Ewelina Rupnik","Tri Dung Nguyen","Stéphane Jacquemoud","Yann Klinger"],"pdf_url":"https://arxiv.org/pdf/2409.12014v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.10533v3","updated":"2024-09-22T16:40:09Z","published":"2024-08-31T00:59:29Z","title":"Ethical Challenges in Computer Vision: Ensuring Privacy and Mitigating\n Bias in Publicly Available Datasets","summary":" This paper aims to shed light on the ethical problems of creating and\ndeploying computer vision tech, particularly in using publicly available\ndatasets. Due to the rapid growth of machine learning and artificial\nintelligence, computer vision has become a vital tool in many industries,\nincluding medical care, security systems, and trade. However, extensive use of\nvisual data that is often collected without consent due to an informed\ndiscussion of its ramifications raises significant concerns about privacy and\nbias. The paper also examines these issues by analyzing popular datasets such\nas COCO, LFW, ImageNet, CelebA, PASCAL VOC, etc., that are usually used for\ntraining computer vision models. We offer a comprehensive ethical framework\nthat addresses these challenges regarding the protection of individual rights,\nminimization of bias as well as openness and responsibility. 
We aim to\nencourage AI development that will take into account societal values as well as\nethical standards to avoid any public harm.\n","authors":["Ghalib Ahmed Tahir"],"pdf_url":"https://arxiv.org/pdf/2409.10533v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2409.11718v2","updated":"2024-09-22T08:23:33Z","published":"2024-09-18T05:55:01Z","title":"Free-VSC: Free Semantics from Visual Foundation Models for Unsupervised\n Video Semantic Compression","summary":" Unsupervised video semantic compression (UVSC), i.e., compressing videos to\nbetter support various analysis tasks, has recently garnered attention.\nHowever, the semantic richness of previous methods remains limited, due to the\nsingle semantic learning objective, limited training data, etc. To address\nthis, we propose to boost the UVSC task by absorbing the off-the-shelf rich\nsemantics from VFMs. Specifically, we introduce a VFMs-shared semantic\nalignment layer, complemented by VFM-specific prompts, to flexibly align\nsemantics between the compressed video and various VFMs. This allows different\nVFMs to collaboratively build a mutually-enhanced semantic space, guiding the\nlearning of the compression model. Moreover, we introduce a dynamic\ntrajectory-based inter-frame compression scheme, which first estimates the\nsemantic trajectory based on the historical content, and then traverses along\nthe trajectory to predict the future semantics as the coding context. This\nreduces the overall bitcost of the system, further improving the compression\nefficiency. Our approach outperforms previous coding methods on three\nmainstream tasks and six datasets.\n","authors":["Yuan Tian","Guo Lu","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2409.11718v2.pdf","comment":"ECCV2024"}]},"2024-09-24T00:00:00Z":{"Information Retrieval":[{"id":"http://arxiv.org/abs/2409.06793v2","updated":"2024-09-24T02:09:10Z","published":"2024-09-10T18:02:51Z","title":"Adversarial Attacks to Multi-Modal Models","summary":" Multi-modal models have gained significant attention due to their powerful\ncapabilities. These models effectively align embeddings across diverse data\nmodalities, showcasing superior performance in downstream tasks compared to\ntheir unimodal counterparts. Recent study showed that the attacker can\nmanipulate an image or audio file by altering it in such a way that its\nembedding matches that of an attacker-chosen targeted input, thereby deceiving\ndownstream models. However, this method often underperforms due to inherent\ndisparities in data from different modalities. In this paper, we introduce\nCrossFire, an innovative approach to attack multi-modal models. CrossFire\nbegins by transforming the targeted input chosen by the attacker into a format\nthat matches the modality of the original image or audio file. We then\nformulate our attack as an optimization problem, aiming to minimize the angular\ndeviation between the embeddings of the transformed input and the modified\nimage or audio file. Solving this problem determines the perturbations to be\nadded to the original media. Our extensive experiments on six real-world\nbenchmark datasets reveal that CrossFire can significantly manipulate\ndownstream tasks, surpassing existing attacks. 
Additionally, we evaluate six\ndefensive strategies against CrossFire, finding that current defenses are\ninsufficient to counteract our CrossFire.\n","authors":["Zhihao Dou","Xin Hu","Haibo Yang","Zhuqing Liu","Minghong Fang"],"pdf_url":"https://arxiv.org/pdf/2409.06793v2.pdf","comment":"To appear in the ACM Workshop on Large AI Systems and Models with\n Privacy and Safety Analysis 2024 (LAMPS '24)"}]},"2024-09-21T00:00:00Z":{"Machine Learning":[{"id":"http://arxiv.org/abs/2409.11884v2","updated":"2024-09-21T06:36:21Z","published":"2024-09-18T11:30:30Z","title":"Recent Advances in OOD Detection: Problems and Approaches","summary":" Out-of-distribution (OOD) detection aims to detect test samples outside the\ntraining category space, which is an essential component in building reliable\nmachine learning systems. Existing reviews on OOD detection primarily focus on\nmethod taxonomy, surveying the field by categorizing various approaches.\nHowever, many recent works concentrate on non-traditional OOD detection\nscenarios, such as test-time adaptation, multi-modal data sources and other\nnovel contexts. In this survey, we uniquely review recent advances in OOD\ndetection from the problem scenario perspective for the first time. According\nto whether the training process is completely controlled, we divide OOD\ndetection methods into training-driven and training-agnostic. Besides,\nconsidering the rapid development of pre-trained models, large pre-trained\nmodel-based OOD detection is also regarded as an important category and\ndiscussed separately. Furthermore, we provide a discussion of the evaluation\nscenarios, a variety of applications, and several future research directions.\nWe believe this survey with new taxonomy will benefit the proposal of new\nmethods and the expansion of more practical scenarios. 
A curated list of\nrelated papers is provided in the Github repository:\nhttps://github.com/shuolucs/Awesome-Out-Of-Distribution-Detection\n","authors":["Shuo Lu","Yingsheng Wang","Lijun Sheng","Aihua Zheng","Lingxiao He","Jian Liang"],"pdf_url":"https://arxiv.org/pdf/2409.11884v2.pdf","comment":"First Submitted in May 2024"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + + --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: 
var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + 
font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..e94472fc --- /dev/null +++ b/index.html @@ -0,0 +1,19336 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Information Retrieval 1 + +
+
+
+ + ♻ ☆ Adversarial Attacks to Multi-Modal Models + + +
+ Multi-modal models have gained significant attention due to their powerful
+capabilities. These models effectively align embeddings across diverse data
+modalities, showcasing superior performance in downstream tasks compared to
+their unimodal counterparts. A recent study showed that an attacker can
+manipulate an image or audio file by altering it in such a way that its
+embedding matches that of an attacker-chosen targeted input, thereby deceiving
+downstream models. However, this method often underperforms due to inherent
+disparities in data from different modalities. In this paper, we introduce
+CrossFire, an innovative approach to attacking multi-modal models. CrossFire
+begins by transforming the targeted input chosen by the attacker into a format
+that matches the modality of the original image or audio file. We then
+formulate our attack as an optimization problem, aiming to minimize the angular
+deviation between the embeddings of the transformed input and the modified
+image or audio file. Solving this problem determines the perturbations to be
+added to the original media. Our extensive experiments on six real-world
+benchmark datasets reveal that CrossFire can significantly manipulate
+downstream tasks, surpassing existing attacks. Additionally, we evaluate six
+defensive strategies against CrossFire, finding that current defenses are
+insufficient to counteract it.
+
+ comment: To appear in the ACM Workshop on Large AI Systems and Models with + Privacy and Safety Analysis 2024 (LAMPS '24) +
+
+
+
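+ As an illustration of the optimization described above, the following is a
+minimal PyTorch sketch that tunes a pixel-level perturbation to minimize the
+angular deviation between two embeddings. The embed() function, step count,
+and epsilon budget are placeholder assumptions, not the paper's actual models
+or hyperparameters.
+
+import torch
+import torch.nn.functional as F
+
+def crossfire_style_attack(embed, x_orig, target_emb, steps=200, lr=1e-2, eps=8/255):
+    # embed() stands in for the attacked model's multi-modal encoder (an
+    # assumption for this sketch); x_orig is an image tensor in [0, 1].
+    delta = torch.zeros_like(x_orig, requires_grad=True)
+    opt = torch.optim.Adam([delta], lr=lr)
+    for _ in range(steps):
+        emb = embed(torch.clamp(x_orig + delta, 0.0, 1.0))
+        # Angular deviation: minimize 1 - cosine similarity to the target.
+        loss = 1.0 - F.cosine_similarity(emb, target_emb, dim=-1).mean()
+        opt.zero_grad()
+        loss.backward()
+        opt.step()
+        # Keep the perturbation within an L-infinity budget so it stays subtle.
+        with torch.no_grad():
+            delta.clamp_(-eps, eps)
+    return torch.clamp(x_orig + delta.detach(), 0.0, 1.0)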
+
+
+
+
+
+ +
+
+
+ + Computation and Language 2 + +
+
+
+ + ♻ ☆ A Controlled Study on Long Context Extension and Generalization in LLMs + + +
+ Broad textual understanding and in-context learning require language models +that utilize full document contexts. Due to the implementation challenges +associated with directly training long-context models, many methods have been +proposed for extending models to handle long contexts. However, owing to +differences in data and model classes, it has been challenging to compare these +approaches, leading to uncertainty as to how to evaluate long-context +performance and whether it differs from standard evaluation. We implement a +controlled protocol for extension methods with a standardized evaluation, +utilizing consistent base models and extension data. Our study yields several +insights into long-context behavior. First, we reaffirm the critical role of +perplexity as a general-purpose performance indicator even in longer-context +tasks. Second, we find that current approximate attention methods +systematically underperform across long-context tasks. Finally, we confirm that +exact fine-tuning based methods are generally effective within the range of +their extension, whereas extrapolation remains challenging. All codebases, +models, and checkpoints will be made available open-source, promoting +transparency and facilitating further research in this critical area of AI +development. + +
+
+
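+ Because the study reaffirms perplexity as the general-purpose indicator for
+long-context evaluation, a one-function reminder of how perplexity follows from
+per-token log-probabilities may be useful; the log_probs input is a
+hypothetical stand-in for whatever the evaluated model emits.
+
+import math
+
+def perplexity(log_probs):
+    # log_probs: natural-log probabilities the model assigned to each
+    # ground-truth next token; perplexity is exp of the mean negative
+    # log-likelihood.
+    nll = -sum(log_probs) / len(log_probs)
+    return math.exp(nll)
+
+# A model assigning probability 0.25 to every token has perplexity 4.
+print(perplexity([math.log(0.25)] * 10))  # 4.0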
+
+
+ + ♻ ☆ Evaluating language models as risk scores + + +
+ Current question-answering benchmarks predominantly focus on accuracy in +realizable prediction tasks. Conditioned on a question and answer-key, does the +most likely token match the ground truth? Such benchmarks necessarily fail to +evaluate LLMs' ability to quantify ground-truth outcome uncertainty. In this +work, we focus on the use of LLMs as risk scores for unrealizable prediction +tasks. We introduce folktexts, a software package to systematically generate +risk scores using LLMs, and evaluate them against US Census data products. A +flexible API enables the use of different prompting schemes, local or +web-hosted models, and diverse census columns that can be used to compose +custom prediction tasks. We evaluate 17 recent LLMs across five proposed +benchmark tasks. We find that zero-shot risk scores produced by multiple-choice +question-answering have high predictive signal but are widely miscalibrated. +Base models consistently overestimate outcome uncertainty, while +instruction-tuned models underestimate uncertainty and produce over-confident +risk scores. In fact, instruction-tuning polarizes answer distribution +regardless of true underlying data uncertainty. This reveals a general +inability of instruction-tuned LLMs to express data uncertainty using +multiple-choice answers. A separate experiment using verbalized chat-style risk +queries yields substantially improved calibration across instruction-tuned +models. These differences in ability to quantify data uncertainty cannot be +revealed in realizable settings, and highlight a blind-spot in the current +evaluation ecosystem that folktexts covers. + +
+
+
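+ The miscalibration reported above is usually visualized with a reliability
+diagram. The sketch below computes one with scikit-learn on synthetic scores;
+it is illustrative only and does not use the folktexts API.
+
+import numpy as np
+from sklearn.calibration import calibration_curve
+
+rng = np.random.default_rng(0)
+y_prob = rng.uniform(0, 1, 5000)               # model risk scores
+y_true = rng.uniform(0, 1, 5000) < y_prob**2   # outcomes miscalibrated on purpose
+
+# prob_true: empirical frequency per bin; prob_pred: mean predicted score per bin.
+prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=10)
+for p_hat, p_obs in zip(prob_pred, prob_true):
+    print(f"predicted {p_hat:.2f} -> observed {p_obs:.2f}")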
+
+
+
+
+
+ + Machine Learning 3 + +
+
+
+ + ♻ ☆ A Controlled Study on Long Context Extension and Generalization in LLMs + + +
+ Broad textual understanding and in-context learning require language models +that utilize full document contexts. Due to the implementation challenges +associated with directly training long-context models, many methods have been +proposed for extending models to handle long contexts. However, owing to +differences in data and model classes, it has been challenging to compare these +approaches, leading to uncertainty as to how to evaluate long-context +performance and whether it differs from standard evaluation. We implement a +controlled protocol for extension methods with a standardized evaluation, +utilizing consistent base models and extension data. Our study yields several +insights into long-context behavior. First, we reaffirm the critical role of +perplexity as a general-purpose performance indicator even in longer-context +tasks. Second, we find that current approximate attention methods +systematically underperform across long-context tasks. Finally, we confirm that +exact fine-tuning based methods are generally effective within the range of +their extension, whereas extrapolation remains challenging. All codebases, +models, and checkpoints will be made available open-source, promoting +transparency and facilitating further research in this critical area of AI +development. + +
+
+
+
+
+ + ♻ ☆ Calibration Error for Decision Making + + +
+ Calibration allows predictions to be reliably interpreted as probabilities by +decision makers. We propose a decision-theoretic calibration error, the +Calibration Decision Loss (CDL), defined as the maximum improvement in decision +payoff obtained by calibrating the predictions, where the maximum is over all +payoff-bounded decision tasks. Vanishing CDL guarantees the payoff loss from +miscalibration vanishes simultaneously for all downstream decision tasks. We +show separations between CDL and existing calibration error metrics, including +the most well-studied metric Expected Calibration Error (ECE). Our main +technical contribution is a new efficient algorithm for online calibration that +achieves near-optimal $O(\frac{\log T}{\sqrt{T}})$ expected CDL, bypassing the +$\Omega(T^{-0.472})$ lower bound for ECE by Qiao and Valiant (2021). + +
+
+ comment: In FOCS 2024 +
+
+
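+ For contrast with the proposed CDL, the Expected Calibration Error mentioned
+above can be stated in a few lines: bin predictions by confidence and average
+the gap between confidence and accuracy, weighted by bin mass. A minimal NumPy
+sketch of that standard metric:
+
+import numpy as np
+
+def expected_calibration_error(conf, correct, n_bins=15):
+    # conf: predicted probabilities; correct: 0/1 indicators of correctness.
+    edges = np.linspace(0.0, 1.0, n_bins + 1)
+    ece = 0.0
+    for lo, hi in zip(edges[:-1], edges[1:]):
+        mask = (conf > lo) & (conf <= hi)
+        if mask.any():
+            # Bin weight times the |accuracy - confidence| gap in the bin.
+            ece += mask.mean() * abs(correct[mask].mean() - conf[mask].mean())
+    return ece
+
+ CDL replaces this distribution-level gap with the worst-case payoff a decision
+maker forgoes by acting on uncalibrated predictions, which is the quantity the
+$O(\frac{\log T}{\sqrt{T}})$ guarantee above controls.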
+
+
+ + ♻ ☆ Handling Long-Term Safety and Uncertainty in Safe Reinforcement Learning + + +
+ Safety is one of the key issues preventing the deployment of reinforcement +learning techniques in real-world robots. While most approaches in the Safe +Reinforcement Learning area do not require prior knowledge of constraints and +robot kinematics and rely solely on data, it is often difficult to deploy them +in complex real-world settings. Instead, model-based approaches that +incorporate prior knowledge of the constraints and dynamics into the learning +framework have proven capable of deploying the learning algorithm directly on +the real robot. Unfortunately, while an approximated model of the robot +dynamics is often available, the safety constraints are task-specific and hard +to obtain: they may be too complicated to encode analytically, too expensive to +compute, or it may be difficult to envision a priori the long-term safety +requirements. In this paper, we bridge this gap by extending the safe +exploration method, ATACOM, with learnable constraints, with a particular focus +on ensuring long-term safety and handling of uncertainty. Our approach is +competitive or superior to state-of-the-art methods in final performance while +maintaining safer behavior during training. + +
+
+ comment: Preprint version of a paper accepted to the Conference on Robot + Learning +
+
+
+
+
+
+
+
+ + Multimedia 1 + +
+
+
+ + ♻ ☆ SVDD 2024: The Inaugural Singing Voice Deepfake Detection Challenge + + +
+ With the advancements in singing voice generation and the growing presence of +AI singers on media platforms, the inaugural Singing Voice Deepfake Detection +(SVDD) Challenge aims to advance research in identifying AI-generated singing +voices from authentic singers. This challenge features two tracks: a controlled +setting track (CtrSVDD) and an in-the-wild scenario track (WildSVDD). The +CtrSVDD track utilizes publicly available singing vocal data to generate +deepfakes using state-of-the-art singing voice synthesis and conversion +systems. Meanwhile, the WildSVDD track expands upon the existing SingFake +dataset, which includes data sourced from popular user-generated content +websites. For the CtrSVDD track, we received submissions from 47 teams, with 37 +surpassing our baselines and the top team achieving a 1.65% equal error rate. +For the WildSVDD track, we benchmarked the baselines. This paper reviews these +results, discusses key findings, and outlines future directions for SVDD +research. + +
+
+ comment: 6 pages, Accepted by 2024 IEEE Spoken Language Technology Workshop + (SLT 2024) +
+
+
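+ The 1.65% figure above is an equal error rate. The sketch below shows the
+usual way an EER is read off a ROC curve, using scikit-learn on placeholder
+scores rather than challenge data.
+
+import numpy as np
+from sklearn.metrics import roc_curve
+
+def equal_error_rate(labels, scores):
+    # EER is the operating point where the false-acceptance rate equals the
+    # false-rejection rate; along the ROC curve, fnr = 1 - tpr.
+    fpr, tpr, _ = roc_curve(labels, scores)
+    fnr = 1.0 - tpr
+    idx = np.nanargmin(np.abs(fnr - fpr))
+    return (fpr[idx] + fnr[idx]) / 2.0
+
+rng = np.random.default_rng(0)
+labels = np.r_[np.ones(500), np.zeros(500)]            # 1 = bona fide, 0 = deepfake
+scores = np.r_[rng.normal(1, 1, 500), rng.normal(-1, 1, 500)]
+print(f"EER: {equal_error_rate(labels, scores):.3%}")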
+
+
+
+
+
+
+ +
+
+
+ + Computer Vision and Pattern Recognition 3 + +
+
+
+ + ♻ ☆ BRDF-NeRF: Neural Radiance Fields with Optical Satellite Images and BRDF + Modelling + + +
+ Understanding the anisotropic reflectance of complex Earth surfaces from +satellite imagery is crucial for numerous applications. Neural radiance fields +(NeRF) have become popular as a machine learning technique capable of deducing +the bidirectional reflectance distribution function (BRDF) of a scene from +multiple images. However, prior research has largely concentrated on applying +NeRF to close-range imagery, estimating basic Microfacet BRDF models, which +fall short for many Earth surfaces. Moreover, high-quality NeRFs generally +require several images captured simultaneously, a rare occurrence in satellite +imaging. To address these limitations, we propose BRDF-NeRF, developed to +explicitly estimate the Rahman-Pinty-Verstraete (RPV) model, a semi-empirical +BRDF model commonly employed in remote sensing. We assess our approach using +two datasets: (1) Djibouti, captured in a single epoch at varying viewing +angles with a fixed Sun position, and (2) Lanzhou, captured over multiple +epochs with different viewing angles and Sun positions. Our results, based on +only three to four satellite images for training, demonstrate that BRDF-NeRF +can effectively synthesize novel views from directions far removed from the +training data and produce high-quality digital surface models (DSMs). + +
+
+
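+ For readers unfamiliar with the RPV model estimated here, the following is a
+sketch of its commonly cited form (Rahman, Pinty and Verstraete, 1993): a
+Minnaert-like angular term, a Henyey-Greenstein phase function, and a hot-spot
+factor. Sign and angle conventions vary across the literature, and the hot-spot
+parameter is approximated by rho0 below, so treat this as indicative only.
+
+import numpy as np
+
+def rpv(theta_s, theta_v, phi, rho0, k, theta_hg):
+    # Rahman-Pinty-Verstraete BRDF under one common convention; angles in
+    # radians. theta_s/theta_v: sun/view zenith; phi: relative azimuth.
+    mu_s, mu_v = np.cos(theta_s), np.cos(theta_v)
+    # Minnaert-like term controlling the overall angular shape.
+    minnaert = (mu_s * mu_v * (mu_s + mu_v)) ** (k - 1.0)
+    # Phase angle g between the illumination and viewing directions.
+    cos_g = mu_s * mu_v + np.sin(theta_s) * np.sin(theta_v) * np.cos(phi)
+    # Henyey-Greenstein function for forward/backward scattering asymmetry.
+    f_hg = (1.0 - theta_hg**2) / (1.0 + 2.0 * theta_hg * cos_g + theta_hg**2) ** 1.5
+    # Hot-spot factor boosting reflectance near exact backscattering.
+    g_hot = np.sqrt(np.tan(theta_s)**2 + np.tan(theta_v)**2
+                    - 2.0 * np.tan(theta_s) * np.tan(theta_v) * np.cos(phi))
+    hot_spot = 1.0 + (1.0 - rho0) / (1.0 + g_hot)
+    return rho0 * minnaert * f_hg * hot_spot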
+
+
+ + ♻ ☆ Ethical Challenges in Computer Vision: Ensuring Privacy and Mitigating + Bias in Publicly Available Datasets + + +
+ This paper aims to shed light on the ethical problems of creating and
+deploying computer vision technology, particularly in using publicly available
+datasets. Due to the rapid growth of machine learning and artificial
+intelligence, computer vision has become a vital tool in many industries,
+including medical care, security systems, and trade. However, the extensive use
+of visual data that is often collected without consent or an informed
+discussion of its ramifications raises significant concerns about privacy and
+bias. The paper examines these issues by analyzing popular datasets such as
+COCO, LFW, ImageNet, CelebA, and PASCAL VOC, which are usually used for
+training computer vision models. We offer a comprehensive ethical framework
+that addresses these challenges regarding the protection of individual rights,
+minimization of bias, and openness and responsibility. We aim to encourage AI
+development that takes societal values as well as ethical standards into
+account, in order to avoid public harm.
+
+
+
+
+ + ♻ ☆ Free-VSC: Free Semantics from Visual Foundation Models for Unsupervised + Video Semantic Compression ECCV2024 + + +
+ Unsupervised video semantic compression (UVSC), i.e., compressing videos to +better support various analysis tasks, has recently garnered attention. +However, the semantic richness of previous methods remains limited, due to the +single semantic learning objective, limited training data, etc. To address +this, we propose to boost the UVSC task by absorbing the off-the-shelf rich +semantics from VFMs. Specifically, we introduce a VFMs-shared semantic +alignment layer, complemented by VFM-specific prompts, to flexibly align +semantics between the compressed video and various VFMs. This allows different +VFMs to collaboratively build a mutually-enhanced semantic space, guiding the +learning of the compression model. Moreover, we introduce a dynamic +trajectory-based inter-frame compression scheme, which first estimates the +semantic trajectory based on the historical content, and then traverses along +the trajectory to predict the future semantics as the coding context. This +reduces the overall bitcost of the system, further improving the compression +efficiency. Our approach outperforms previous coding methods on three +mainstream tasks and six datasets. + +
+
+ comment: ECCV2024 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Machine Learning 1 + +
+
+
+ + ♻ ☆ Recent Advances in OOD Detection: Problems and Approaches + + +
+ Out-of-distribution (OOD) detection aims to detect test samples outside the +training category space, which is an essential component in building reliable +machine learning systems. Existing reviews on OOD detection primarily focus on +method taxonomy, surveying the field by categorizing various approaches. +However, many recent works concentrate on non-traditional OOD detection +scenarios, such as test-time adaptation, multi-modal data sources and other +novel contexts. In this survey, we uniquely review recent advances in OOD +detection from the problem scenario perspective for the first time. According +to whether the training process is completely controlled, we divide OOD +detection methods into training-driven and training-agnostic. Besides, +considering the rapid development of pre-trained models, large pre-trained +model-based OOD detection is also regarded as an important category and +discussed separately. Furthermore, we provide a discussion of the evaluation +scenarios, a variety of applications, and several future research directions. +We believe this survey with new taxonomy will benefit the proposal of new +methods and the expansion of more practical scenarios. A curated list of +related papers is provided in the Github repository: +https://github.com/shuolucs/Awesome-Out-Of-Distribution-Detection + +
+
+ comment: First Submitted in May 2024 +
+
+
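+ As context for the survey's training-agnostic category, the classic baseline
+is the maximum softmax probability score of Hendrycks and Gimpel: flag inputs
+whose top softmax probability is low. A minimal sketch:
+
+import numpy as np
+
+def msp_ood_score(logits):
+    # Softmax with max subtraction for numerical stability.
+    z = logits - logits.max(axis=-1, keepdims=True)
+    probs = np.exp(z) / np.exp(z).sum(axis=-1, keepdims=True)
+    # A lower maximum probability suggests the input is out-of-distribution.
+    return 1.0 - probs.max(axis=-1)
+
+logits = np.array([[5.0, 0.1, -2.0],   # confident prediction -> low OOD score
+                   [0.2, 0.1, 0.0]])   # near-uniform prediction -> high OOD score
+print(msp_ood_score(logits))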
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 9 + +
+
+
+ + ♻ ☆ Gender Representation and Bias in Indian Civil Service Mock Interviews + + +
+ This paper makes three key contributions. First, via a substantial corpus of +51,278 interview questions sourced from 888 YouTube videos of mock interviews +of Indian civil service candidates, we demonstrate stark gender bias in the +broad nature of questions asked to male and female candidates. Second, our +experiments with large language models show a strong presence of gender bias in +explanations provided by the LLMs on the gender inference task. Finally, we +present a novel dataset of 51,278 interview questions that can inform future +social science studies. + +
+
+
+
+
+ + ♻ ☆ Contextual Breach: Assessing the Robustness of Transformer-based QA + Models + + +
+ Contextual question-answering models are susceptible to adversarial
+perturbations to the input context, commonly observed in real-world scenarios.
+This adversarial noise is designed to degrade the performance of the model by
+distorting the textual input. We introduce a unique dataset that incorporates
+seven distinct types of adversarial noise into the context, each applied at
+five different intensity levels on the SQuAD dataset. To quantify robustness,
+we use robustness metrics that provide a standardized measure for assessing
+model performance across varying noise types and levels. Experiments on
+transformer-based question-answering models reveal robustness vulnerabilities
+and important insights into model performance on realistic textual input.
+
+
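+ To make the "noise type at an intensity level" design concrete, here is a
+hedged sketch of one plausible perturbation: adjacent-character swaps applied
+to a context at a chosen rate. The actual noise operators and the five levels
+in the dataset are the authors'; this only illustrates the mechanics.
+
+import random
+
+def char_swap_noise(text, intensity, seed=0):
+    # Swap two adjacent characters in roughly `intensity` of the longer words.
+    rng = random.Random(seed)
+    words = text.split()
+    for i, w in enumerate(words):
+        if len(w) > 3 and rng.random() < intensity:
+            j = rng.randrange(len(w) - 1)
+            words[i] = w[:j] + w[j + 1] + w[j] + w[j + 2:]
+    return " ".join(words)
+
+ctx = "The Norman dynasty had a major political impact on medieval Europe."
+for level in (0.2, 0.4, 0.6, 0.8, 1.0):   # five increasing intensity levels
+    print(level, char_swap_noise(ctx, level))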
+
+
+ + ♻ ☆ ExtractGPT: Exploring the Potential of Large Language Models for Product + Attribute Value Extraction + + +
+ E-commerce platforms require structured product data in the form of +attribute-value pairs to offer features such as faceted product search or +attribute-based product comparison. However, vendors often provide unstructured +product descriptions, necessitating the extraction of attribute-value pairs +from these texts. BERT-based extraction methods require large amounts of +task-specific training data and struggle with unseen attribute values. This +paper explores using large language models (LLMs) as a more training-data +efficient and robust alternative. We propose prompt templates for zero-shot and +few-shot scenarios, comparing textual and JSON-based target schema +representations. Our experiments show that GPT-4 achieves the highest average +F1-score of 85% using detailed attribute descriptions and demonstrations. +Llama-3-70B performs nearly as well, offering a competitive open-source +alternative. GPT-4 surpasses the best PLM baseline by 5% in F1-score. +Fine-tuning GPT-3.5 increases the performance to the level of GPT-4 but reduces +the model's ability to generalize to unseen attribute values. + +
+
+
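+ A zero-shot prompt with a JSON-based target schema, one of the variants the
+paper compares, might look like the sketch below; the authors' exact templates
+differ, so this wording is only indicative.
+
+import json
+
+def build_extraction_prompt(description, attributes):
+    # JSON target schema: one key per attribute for the LLM to fill in.
+    schema = {attr: "string or null" for attr in attributes}
+    return (
+        "Extract the following attributes from the product description.\n"
+        f"Return JSON matching this schema: {json.dumps(schema)}\n"
+        "Use null when an attribute value is not mentioned.\n\n"
+        f"Product description: {description}"
+    )
+
+print(build_extraction_prompt(
+    "Stainless steel water bottle, 750 ml, vacuum insulated.",
+    ["material", "capacity", "color"],
+))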
+
+
+ + ♻ ☆ Autoregressive + Chain of Thought = Recurrent: Recurrence's Role in + Language Models' Computability and a Revisit of Recurrent Transformer + + +
+ The Transformer architecture excels in a variety of language modeling tasks, +outperforming traditional neural architectures such as RNN and LSTM. This is +partially due to its elimination of recurrent connections, which allows for +parallel training and a smoother flow of gradients. However, this move away +from recurrent structures places the Transformer model at the lower end of +Chomsky's computational hierarchy, imposing limitations on its computational +abilities. Consequently, even advanced Transformer-based models face +considerable difficulties in tasks like counting, string reversal, and +multiplication. These tasks, though seemingly elementary, require a level of +computational complexity that exceeds the capabilities of the Transformer +architecture. Concurrently, the emergence of ``Chain of Thought" (CoT) +prompting has enabled Transformer-based language models to tackle tasks that +were previously impossible or poorly executed. In this work, we thoroughly +investigate the influence of recurrent structures in neural models on their +reasoning abilities and computability, contrasting the role autoregression +plays in the neural models' computational power. We then shed light on how the +CoT approach can mimic recurrent computation and act as a bridge between +autoregression and recurrence in the context of language models. It is this +approximated recurrence that notably improves the model's performance and +computational capacity. Moreover, we revisit recent recurrent-based Transformer +model designs, focusing on their computational abilities through our proposed +concept of ``recurrence-completeness" and identify key theoretical limitations +in models like Linear Transformer and RWKV. Through this, we aim to provide +insight into the neural model architectures and prompt better model design. + +
+
+
+
+
+ + ♻ ☆ Development and bilingual evaluation of Japanese medical large language + model within reasonably low computational resources + + +
+ The recent success of large language models (LLMs) and the scaling law have
+led to widespread adoption of larger models. Particularly in the healthcare
+industry, there is an increasing demand for locally operated LLMs due to
+security concerns. However, the majority of high-quality open-source LLMs have
+a size of 70B parameters, imposing significant financial burdens on users for
+GPU preparation and operation. To overcome these issues, we present a medical
+adaptation based on recent 7B models, which enables operation with low
+computational resources. We compare the performance on medical
+question-answering benchmarks in two languages (Japanese and English),
+demonstrating that its scores reach parity with or surpass those of currently
+existing medical LLMs that are ten times larger. We find that fine-tuning an
+English-centric base model on a Japanese medical dataset improves the score in
+both languages, supporting the effect of cross-lingual knowledge transfer. We
+hope that this study will alleviate financial challenges, serving as a stepping
+stone for clinical institutions to practically utilize LLMs locally. Our
+evaluation code is available at
+https://github.com/stardust-coder/japanese-lm-med-harness.
+
+ comment: 18 pages, 9 tables +
+
+
+
+
+ + ♻ ☆ Conversational Query Reformulation with the Guidance of Retrieved + Documents + + +
+ Conversational search seeks to retrieve relevant passages for the given
+questions in conversational question answering. Conversational Query
+Reformulation (CQR) improves conversational search by refining the original
+queries into de-contextualized forms to resolve issues in the original queries,
+such as omissions and coreferences. Previous CQR methods focus on imitating
+human-written queries, which may not always yield meaningful search results for
+the retriever. In this paper, we introduce GuideCQR, a framework that refines
+queries for CQR by leveraging key information from the initially retrieved
+documents. Specifically, GuideCQR extracts keywords and generates expected
+answers from the retrieved documents, then unifies them with the queries after
+filtering to add useful information that enhances the search process.
+Experimental results demonstrate that our proposed method achieves
+state-of-the-art performance across multiple datasets, outperforming previous
+CQR methods. Additionally, we show that GuideCQR achieves additional
+performance gains in conversational search with various types of queries, even
+queries written by humans.
+
+ comment: 13 pages +
+
+
+
+
+ + ♻ ☆ Fast Analysis of the OpenAI O1-Preview Model in Solving Random K-SAT + Problem: Does the LLM Solve the Problem Itself or Call an External SAT + Solver? + + +
+ In this manuscript, I present an analysis of the performance of the OpenAI
+O1-preview model in solving random K-SAT instances for K$\in {2,3,4}$ as a
+function of $\alpha=M/N$, where $M$ is the number of clauses and $N$ is the
+number of variables of the satisfiability problem. I show that the model can
+call an external SAT solver to solve the instances, rather than solving them
+directly. Despite using external solvers, the model reports incorrect
+assignments as output. Moreover, I propose and present an analysis to quantify
+whether the OpenAI O1-preview model demonstrates a spark of intelligence or
+merely makes random guesses when outputting an assignment for a Boolean
+satisfiability problem.
+
+
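+ Random K-SAT instances at clause density alpha = M/N are easy to generate and
+to verify, which is how a reported assignment can be checked. A small
+self-contained sketch (not the author's evaluation harness):
+
+import random
+
+def random_ksat(n_vars, alpha, k=3, seed=0):
+    # Sample M = round(alpha * N) clauses, each over k distinct variables,
+    # negating each literal with probability 1/2.
+    rng = random.Random(seed)
+    clauses = []
+    for _ in range(round(alpha * n_vars)):
+        variables = rng.sample(range(1, n_vars + 1), k)
+        clauses.append([v if rng.random() < 0.5 else -v for v in variables])
+    return clauses
+
+def satisfies(clauses, assignment):
+    # assignment maps variable -> bool; a clause holds if any literal is true.
+    return all(any(assignment[abs(l)] == (l > 0) for l in c) for c in clauses)
+
+clauses = random_ksat(n_vars=20, alpha=4.2, k=3)   # near the 3-SAT threshold
+rng = random.Random(1)
+guess = {v: rng.random() < 0.5 for v in range(1, 21)}
+print(satisfies(clauses, guess))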
+
+
+ + ♻ ☆ Diversity-grounded Channel Prototypical Learning for Out-of-Distribution + Intent Detection + + +
+ In the realm of task-oriented dialogue systems, a robust intent detection +mechanism must effectively handle malformed utterances encountered in +real-world scenarios. This study presents a novel fine-tuning framework for +large language models (LLMs) aimed at enhancing in-distribution (ID) intent +classification and out-of-distribution (OOD) intent detection, which utilizes +semantic matching with prototypes derived from ID class names. By harnessing +the highly distinguishable representations of LLMs, we construct semantic +prototypes for each ID class using a diversity-grounded prompt tuning approach. +We rigorously test our framework in a challenging OOD context, where ID and OOD +classes are semantically close yet distinct, referred to as \emph{near} OOD +detection. For a thorough assessment, we benchmark our method against the +prevalent fine-tuning approaches. The experimental findings reveal that our +method demonstrates superior performance in both few-shot ID intent +classification and near-OOD intent detection tasks. + +
+
+ comment: work in progress +
+
+
+
+
+ + ♻ ☆ RoMath: A Mathematical Reasoning Benchmark in Romanian + + +
+ Mathematics has long been conveyed through natural language, primarily for +human understanding. With the rise of mechanized mathematics and proof +assistants, there is a growing need to understand informal mathematical text, +yet most existing benchmarks focus solely on English, overlooking other +languages. This paper introduces RoMath, a Romanian mathematical reasoning +benchmark suite comprising three datasets: RoMath-Baccalaureate, +RoMath-Competitions and RoMath-Synthetic, which cover a range of mathematical +domains and difficulty levels, aiming to improve non-English language models +and promote multilingual AI development. By focusing on Romanian, a +low-resource language with unique linguistic features, RoMath addresses the +limitations of Anglo-centric models and emphasizes the need for dedicated +resources beyond simple automatic translation. We benchmark several open-weight +language models, highlighting the importance of creating resources for +underrepresented languages. We make the code and dataset available. + +
+
+ comment: 4 Figures, 12 Tables +
+
+
+
+
+
+
+
+ + Information Retrieval 2 + +
+
+
+ + ♻ ☆ TISIS : Trajectory Indexing for SImilarity Search + + +
+ Social media platforms enable users to share diverse types of information, +including geolocation data that captures their movement patterns. Such +geolocation data can be leveraged to reconstruct the trajectory of a user's +visited Points of Interest (POIs). A key requirement in numerous applications +is the ability to measure the similarity between such trajectories, as this +facilitates the retrieval of trajectories that are similar to a given reference +trajectory. This is the main focus of our work. Existing methods predominantly +rely on applying a similarity function to each candidate trajectory to identify +those that are sufficiently similar. However, this approach becomes +computationally expensive when dealing with large-scale datasets. To mitigate +this challenge, we propose TISIS, an efficient method that uses trajectory +indexing to quickly find similar trajectories that share common POIs in the +same order. Furthermore, to account for scenarios where POIs in trajectories +may not exactly match but are contextually similar, we introduce TISIS*, a +variant of TISIS that incorporates POI embeddings. This extension allows for +more comprehensive retrieval of similar trajectories by considering semantic +similarities between POIs, beyond mere exact matches. Extensive experimental +evaluations demonstrate that the proposed approach significantly outperforms a +baseline method based on the well-known Longest Common SubSequence (LCSS) +algorithm, yielding substantial performance improvements across various +real-world datasets. + +
+
+
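+ The core idea, indexing trajectories so that candidates sharing common POIs
+in the same order surface without a full scan, can be sketched with an inverted
+index plus an ordered-subsequence check. This illustrates the stated idea, not
+the authors' implementation.
+
+from collections import defaultdict
+
+class TrajectoryIndex:
+    def __init__(self):
+        self.by_poi = defaultdict(set)   # POI -> ids of trajectories visiting it
+        self.trajs = {}
+
+    def add(self, traj_id, pois):
+        self.trajs[traj_id] = pois
+        for p in pois:
+            self.by_poi[p].add(traj_id)
+
+    def query(self, pois, min_common=2):
+        # Candidate filter: only trajectories sharing at least one query POI.
+        candidates = set().union(*(self.by_poi[p] for p in pois if p in self.by_poi))
+        return [tid for tid in candidates
+                if self._ordered_common(pois, self.trajs[tid]) >= min_common]
+
+    @staticmethod
+    def _ordered_common(query, traj):
+        # Greedy count of query POIs appearing in traj in the same order;
+        # the `in` test consumes the iterator, enforcing ordering.
+        it = iter(traj)
+        return sum(1 for p in query if p in it)
+
+idx = TrajectoryIndex()
+idx.add("u1", ["cafe", "museum", "park"])
+idx.add("u2", ["museum", "cafe"])
+print(idx.query(["cafe", "museum"]))   # only u1 shares both POIs in order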
+
+
+ + ♻ ☆ Exploring Information Retrieval Landscapes: An Investigation of a Novel + Evaluation Techniques and Comparative Document Splitting Methods + + +
+ The performance of Retrieval-Augmented Generation (RAG) systems in +information retrieval is significantly influenced by the characteristics of the +documents being processed. In this study, the structured nature of textbooks, +the conciseness of articles, and the narrative complexity of novels are shown +to require distinct retrieval strategies. A comparative evaluation of multiple +document-splitting methods reveals that the Recursive Character Splitter +outperforms the Token-based Splitter in preserving contextual integrity. A +novel evaluation technique is introduced, utilizing an open-source model to +generate a comprehensive dataset of question-and-answer pairs, simulating +realistic retrieval scenarios to enhance testing efficiency and metric +reliability. The evaluation employs weighted scoring metrics, including +SequenceMatcher, BLEU, METEOR, and BERT Score, to assess the system's accuracy +and relevance. This approach establishes a refined standard for evaluating the +precision of RAG systems, with future research focusing on optimizing chunk and +overlap sizes to improve retrieval accuracy and efficiency. + +
+
+ comment: This article is 16 pages long and includes detailed comparisons of + RAG systems and document splitting techniques +
+
+
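+ The Recursive Character Splitter favored here works by trying a hierarchy of
+separators (paragraphs, then lines, then sentences, then words) and only
+splitting at a finer granularity when a piece is still too large, which is how
+it preserves contextual integrity. Below is a hand-rolled sketch of that
+behavior; real implementations additionally merge adjacent pieces up to the
+chunk size and add overlap.
+
+def recursive_split(text, chunk_size=200, seps=("\n\n", "\n", ". ", " ")):
+    # Pieces short enough are kept whole; oversize pieces recurse with the
+    # remaining, finer separators.
+    if len(text) <= chunk_size:
+        return [text] if text.strip() else []
+    for sep in seps:
+        if sep in text:
+            chunks = []
+            for part in text.split(sep):
+                if len(part) <= chunk_size:
+                    if part.strip():
+                        chunks.append(part.strip())
+                else:
+                    chunks.extend(recursive_split(part, chunk_size, seps))
+            return chunks
+    # No separator applies: fall back to a hard character cut.
+    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]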
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 6 + +
+
+
+ + ♻ ☆ Skill matching at scale: freelancer-project alignment for efficient + multilingual candidate retrieval + + +
+ Finding the perfect match between a job proposal and a set of freelancers is
+not an easy task to perform at scale, especially in multiple languages. In this
+paper, we propose a novel neural retriever architecture that tackles this
+problem in a multilingual setting. Our method encodes project descriptions and
+freelancer profiles by leveraging pre-trained multilingual language models. The
+latter are used as the backbone of a custom transformer architecture that aims
+to preserve the structure of the profiles and projects. This model is trained
+with a contrastive loss on historical data. Through several experiments, we
+show that this approach effectively captures skill-matching similarity and
+facilitates efficient matching, outperforming traditional methods.
+
+
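+ The contrastive training signal described above is typically an InfoNCE-style
+loss over in-batch negatives: each project embedding should score highest
+against its matched freelancer. A PyTorch sketch under that assumption (the
+encoders themselves are placeholders):
+
+import torch
+import torch.nn.functional as F
+
+def in_batch_contrastive_loss(project_emb, freelancer_emb, temperature=0.07):
+    # project_emb, freelancer_emb: (B, d) embeddings of matched pairs.
+    p = F.normalize(project_emb, dim=-1)
+    f = F.normalize(freelancer_emb, dim=-1)
+    logits = p @ f.t() / temperature          # (B, B) similarity matrix
+    targets = torch.arange(p.size(0), device=p.device)
+    # Row i's positive is column i; other freelancers in the batch are negatives.
+    return F.cross_entropy(logits, targets)
+
+loss = in_batch_contrastive_loss(torch.randn(8, 256, requires_grad=True),
+                                 torch.randn(8, 256, requires_grad=True))
+loss.backward()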
+
+
+ + ♻ ☆ Using Large Language Models to Generate Clinical Trial Tables and + Figures + + +
+ Tables, figures, and listings (TFLs) are essential tools for summarizing
+clinical trial data. Creation of TFLs for reporting activities is often a
+time-consuming task encountered routinely during the execution of clinical
+trials. This study explored the use of large language models (LLMs) to automate
+the generation of TFLs through prompt engineering and few-shot transfer
+learning. Using public clinical trial data in ADaM format, our results
+demonstrated that LLMs can efficiently generate TFLs with prompt instructions,
+showcasing their potential in this domain. Furthermore, we developed a
+conversational agent named the Clinical Trial TFL Generation Agent: an app that
+matches user queries to predefined prompts that produce customized programs to
+generate specific predefined TFLs.
+
+
+
+
+ + ♻ ☆ LOLA -- An Open-Source Massively Multilingual Large Language Model + + +
+ This paper presents LOLA, a massively multilingual large language model +trained on more than 160 languages using a sparse Mixture-of-Experts +Transformer architecture. Our architectural and implementation choices address +the challenge of harnessing linguistic diversity while maintaining efficiency +and avoiding the common pitfalls of multilinguality. Our analysis of the +evaluation results shows competitive performance in natural language generation +and understanding tasks. Additionally, we demonstrate how the learned +expert-routing mechanism exploits implicit phylogenetic linguistic patterns to +potentially alleviate the curse of multilinguality. We provide an in-depth look +at the training process, an analysis of the datasets, and a balanced +exploration of the model's strengths and limitations. As an open-source model, +LOLA promotes reproducibility and serves as a robust foundation for future +research. Our findings enable the development of compute-efficient multilingual +models with strong, scalable performance across languages. + +
+
+
+
+
+ + ♻ ☆ Adversarial Attack for Explanation Robustness of Rationalization Models + + +
+ Rationalization models, which select a subset of the input text as a
+rationale (crucial for humans to understand and trust predictions), have
+recently emerged as a prominent research area in eXplainable Artificial
+Intelligence. However, most previous studies focus on improving the quality of
+the rationale, ignoring its robustness to malicious attacks. Specifically,
+whether rationalization models can still generate high-quality rationales under
+adversarial attack remains unknown. To explore this, this paper proposes UAT2E,
+which aims to undermine the explainability of rationalization models without
+altering their predictions, thereby eliciting distrust in these models from
+human users. UAT2E employs gradient-based search for triggers and then inserts
+them into the original input to conduct both non-targeted and targeted attacks.
+Experimental results on five datasets reveal the vulnerability of
+rationalization models in terms of explanation, where they tend to select more
+meaningless tokens under attack. Based on this, we make a series of
+recommendations for improving rationalization models in terms of explanation.
+
+
+
+
+ + ♻ ☆ The Art of Storytelling: Multi-Agent Generative AI for Dynamic + Multimodal Narratives + + +
+ This paper introduces the concept of an education tool that utilizes +Generative Artificial Intelligence (GenAI) to enhance storytelling for +children. The system combines GenAI-driven narrative co-creation, +text-to-speech conversion, and text-to-video generation to produce an engaging +experience for learners. We describe the co-creation process, the adaptation of +narratives into spoken words using text-to-speech models, and the +transformation of these narratives into contextually relevant visuals through +text-to-video technology. Our evaluation covers the linguistics of the +generated stories, the text-to-speech conversion quality, and the accuracy of +the generated visuals. + +
+
+
+
+
+ + ♻ ☆ Human-like Affective Cognition in Foundation Models + + +
+ Understanding emotions is fundamental to human interaction and experience. +Humans easily infer emotions from situations or facial expressions, situations +from emotions, and do a variety of other affective cognition. How adept is +modern AI at these inferences? We introduce an evaluation framework for testing +affective cognition in foundation models. Starting from psychological theory, +we generate 1,280 diverse scenarios exploring relationships between appraisals, +emotions, expressions, and outcomes. We evaluate the abilities of foundation +models (GPT-4, Claude-3, Gemini-1.5-Pro) and humans (N = 567) across carefully +selected conditions. Our results show foundation models tend to agree with +human intuitions, matching or exceeding interparticipant agreement. In some +conditions, models are ``superhuman'' -- they better predict modal human +judgements than the average human. All models benefit from chain-of-thought +reasoning. This suggests foundation models have acquired a human-like +understanding of emotions and their influence on beliefs and behavior. + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 4 + +
+
+
+ + ♻ ☆ LaMamba-Diff: Linear-Time High-Fidelity Diffusion Models Based on Local + Attention and Mamba + + +
+ Recent Transformer-based diffusion models have shown remarkable performance, +largely attributed to the ability of the self-attention mechanism to accurately +capture both global and local contexts by computing all-pair interactions among +input tokens. However, their quadratic complexity poses significant +computational challenges for long-sequence inputs. Conversely, a recent state +space model called Mamba offers linear complexity by compressing a filtered +global context into a hidden state. Despite its efficiency, compression +inevitably leads to information loss of fine-grained local dependencies among +tokens, which are crucial for effective visual generative modeling. Motivated +by these observations, we introduce Local Attentional Mamba (LaMamba) blocks +that combine the strengths of self-attention and Mamba, capturing both global +contexts and local details with linear complexity. Leveraging the efficient +U-Net architecture, our model exhibits exceptional scalability and surpasses +the performance of DiT across various model scales on ImageNet at 256x256 +resolution, all while utilizing substantially fewer GFLOPs and a comparable +number of parameters. Compared to state-of-the-art diffusion models on ImageNet +256x256 and 512x512, our largest model presents notable advantages, such as a +reduction of up to 62% GFLOPs compared to DiT-XL/2, while achieving superior +performance with comparable or fewer parameters. Our code is available at +https://github.com/yunxiangfu2001/LaMamba-Diff. + +
+
+
+
+
+ + ♻ ☆ NN-Copula-CD: A Copula-Guided Interpretable Neural Network for Change + Detection in Heterogeneous Remote Sensing Images + + +
+ Change detection (CD) in heterogeneous remote sensing images has been widely +used for disaster monitoring and land-use management. In the past decade, the +heterogeneous CD problem has significantly benefited from the development of +deep neural networks (DNNs). However, the purely data-driven DNNs perform like +a black box where the lack of interpretability limits the trustworthiness and +controllability of DNNs in most practical CD applications. As a powerful +knowledge-driven tool, copula theory performs well in modeling relationships +among random variables. To enhance the interpretability of existing neural +networks for CD, we propose a knowledge-data-driven heterogeneous CD method +based on a copula-guided neural network, named NN-Copula-CD. In our +NN-Copula-CD, the mathematical characteristics of copula are employed as the +loss functions to supervise a neural network to learn the dependence between +bi-temporal heterogeneous superpixel pairs, and then the changed regions are +identified via binary classification based on the degrees of dependence of all +the superpixel pairs in the bi-temporal images. We conduct in-depth experiments +on three datasets with heterogeneous images, where both quantitative and visual +results demonstrate the effectiveness of our proposed NN-Copula-CD method. + +
comment: The full version of this work is submitted to IEEE TGRS

♻ ☆ Cross-Organ and Cross-Scanner Adenocarcinoma Segmentation using Rein to Fine-tune Vision Foundation Models
In recent years, significant progress has been made in tumor segmentation within the field of digital pathology. However, variations in organs, tissue preparation methods, and image acquisition processes can lead to domain discrepancies among digital pathology images. To address this problem, we use Rein, a fine-tuning method, to parametrically and efficiently fine-tune various vision foundation models (VFMs) for MICCAI 2024 Cross-Organ and Cross-Scanner Adenocarcinoma Segmentation (COSAS2024). The core of Rein consists of a set of learnable tokens, which are directly linked to instances, improving functionality at the instance level in each layer. In the data environment of the COSAS2024 Challenge, extensive experiments demonstrate that Rein fine-tunes the VFMs to satisfactory results. Specifically, we used Rein to fine-tune ConvNeXt and DINOv2. Our team used the former to achieve scores of 0.7719 and 0.7557 on the preliminary and final test phases of Task 1, respectively, while the latter achieved scores of 0.8848 and 0.8192 on the preliminary and final test phases of Task 2. Code is available at GitHub.
♻ ☆ Adaptive Selection of Sampling-Reconstruction in Fourier Compressed Sensing ECCV 2024
Compressed sensing (CS) has emerged to overcome the inefficiency of Nyquist sampling. However, traditional optimization-based reconstruction is slow and cannot yield an exact image in practice. Deep learning-based reconstruction has been a promising alternative, outperforming optimization-based reconstruction in accuracy and computation speed. Finding an efficient sampling method with deep learning-based reconstruction, especially for Fourier CS, remains a challenge. Existing work on joint optimization of sampling and reconstruction ($\mathcal{H}_1$) optimizes the sampling mask but has limited potential because the mask is not adaptive to each data point. Adaptive sampling ($\mathcal{H}_2$) also has the disadvantages of difficult optimization and Pareto sub-optimality. Here, we propose a novel adaptive selection of sampling-reconstruction ($\mathcal{H}_{1.5}$) framework that selects the best sampling mask and reconstruction network for each input data point. We provide theorems showing that our method has higher potential than $\mathcal{H}_1$ and effectively solves the Pareto sub-optimality problem in sampling-reconstruction by using separate reconstruction networks for different sampling masks. To select the best sampling mask, we propose to quantify the high-frequency Bayesian uncertainty of the input using a super-resolution space generation model. Our method outperforms joint optimization of sampling-reconstruction ($\mathcal{H}_1$) and adaptive sampling ($\mathcal{H}_2$), achieving significant improvements on several Fourier CS problems.
comment: 30 pages, 9.8 MB, Accepted to ECCV 2024

Information Retrieval 3

♻ ☆ Skill matching at scale: freelancer-project alignment for efficient multilingual candidate retrieval
Finding the perfect match between a job proposal and a set of freelancers is not an easy task to perform at scale, especially in multiple languages. In this paper, we propose a novel neural retriever architecture that tackles this problem in a multilingual setting. Our method encodes project descriptions and freelancer profiles by leveraging pre-trained multilingual language models. These serve as the backbone of a custom transformer architecture designed to preserve the structure of profiles and projects. The model is trained with a contrastive loss on historical data. Through several experiments, we show that this approach effectively captures skill-matching similarity and facilitates efficient matching, outperforming traditional methods.
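A plausible sketch of the contrastive objective mentioned above, assuming in-batch negatives; the encoders, batch construction, and temperature are hypothetical, not taken from the paper.

import torch
import torch.nn.functional as F

def matching_loss(project_emb: torch.Tensor,
                  freelancer_emb: torch.Tensor,
                  temperature: float = 0.05) -> torch.Tensor:
    """Row i of each tensor is a historically matched project/freelancer pair."""
    p = F.normalize(project_emb, dim=-1)
    f = F.normalize(freelancer_emb, dim=-1)
    logits = p @ f.T / temperature           # cosine similarities for all pairs
    labels = torch.arange(p.size(0))         # true matches sit on the diagonal
    return F.cross_entropy(logits, labels)   # other freelancers act as negatives

loss = matching_loss(torch.randn(32, 768), torch.randn(32, 768))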
♻ ☆ LLM-Powered Text Simulation Attack Against ID-Free Recommender Systems
The ID-free recommendation paradigm has been proposed to address the limitation that traditional recommender systems struggle to model cold-start users or items with new IDs. Despite its effectiveness, this study uncovers that ID-free recommender systems are vulnerable to the proposed Text Simulation attack (TextSimu), which aims to promote specific target items. As a novel type of text poisoning attack, TextSimu exploits large language models (LLMs) to alter the textual information of target items by simulating the characteristics of popular items. It operates effectively in both black-box and white-box settings, utilizing two key components: a unified popularity extraction module, which captures the essential characteristics of popular items, and an N-persona consistency simulation strategy, which creates multiple personas that collaboratively synthesize refined promotional textual descriptions for target items by imitating popular items. To withstand TextSimu-like attacks, we further explore a detection approach for identifying LLM-generated promotional text. Extensive experiments conducted on three datasets demonstrate that TextSimu poses a more significant threat than existing poisoning attacks, while our defense method can detect malicious text of target items generated by TextSimu. By identifying this vulnerability, we aim to advance the development of more robust ID-free recommender systems.
comment: 12 pages

♻ ☆ jina-embeddings-v3: Multilingual Embeddings With Task LoRA
We introduce jina-embeddings-v3, a novel text embedding model with 570 million parameters that achieves state-of-the-art performance on multilingual data and long-context retrieval tasks, supporting context lengths of up to 8192 tokens. The model includes a set of task-specific Low-Rank Adaptation (LoRA) adapters to generate high-quality embeddings for query-document retrieval, clustering, classification, and text matching. Evaluation on the MTEB benchmark shows that jina-embeddings-v3 outperforms the latest proprietary embeddings from OpenAI and Cohere on English tasks, while achieving superior performance compared to multilingual-e5-large-instruct across all multilingual tasks. With a default output dimension of 1024, users can flexibly reduce the embedding dimensions to as low as 32 without compromising performance, enabled by Matryoshka Representation Learning.
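A short sketch of how a Matryoshka-trained embedding is shrunk downstream: keep the leading k dimensions and re-normalize before computing cosine similarity. The loading pattern below is the standard sentence-transformers usage, not code from the paper.

import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("jinaai/jina-embeddings-v3", trust_remote_code=True)
emb = model.encode(["What is Matryoshka learning?", "A nested-representation loss."])

def truncate(e: np.ndarray, k: int) -> np.ndarray:
    e = e[:, :k]                                       # keep the leading k dims
    return e / np.linalg.norm(e, axis=1, keepdims=True)

for k in (1024, 128, 32):                              # from full size down to 32
    a, b = truncate(emb, k)
    print(k, float(a @ b))                             # similarity stays usable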
comment: 20 pages, pp11-13 references, pp14-20 appendix and experiment tables

Machine Learning 4

♻ ☆ Skill matching at scale: freelancer-project alignment for efficient multilingual candidate retrieval
Finding the perfect match between a job proposal and a set of freelancers is not an easy task to perform at scale, especially in multiple languages. In this paper, we propose a novel neural retriever architecture that tackles this problem in a multilingual setting. Our method encodes project descriptions and freelancer profiles by leveraging pre-trained multilingual language models. These serve as the backbone of a custom transformer architecture designed to preserve the structure of profiles and projects. The model is trained with a contrastive loss on historical data. Through several experiments, we show that this approach effectively captures skill-matching similarity and facilitates efficient matching, outperforming traditional methods.
♻ ☆ The Impact of Element Ordering on LM Agent Performance
There has been a surge of interest in language model agents that can navigate virtual environments such as the web or desktop. To navigate such environments, agents benefit from information on the various elements (e.g., buttons, text, or images) present. It remains unclear which element attributes have the greatest impact on agent performance, especially in environments that only provide a graphical representation (i.e., pixels). Here we find that the order in which elements are presented to the language model is surprisingly impactful: randomizing element ordering in a webpage degrades agent performance comparably to removing all visible text from an agent's state representation. While a webpage provides a hierarchical ordering of elements, there is no such ordering when parsing elements directly from pixels. Moreover, as tasks become more challenging and models more sophisticated, our experiments suggest that the impact of ordering increases. Finding an effective ordering is non-trivial. We investigate the impact of various element ordering methods in web and desktop environments, and find that dimensionality reduction provides a viable ordering for pixel-only environments. We train a UI element detection model to derive elements from pixels and apply our findings to an agent benchmark, OmniACT, where we only have access to pixels. On average, our method completes more than twice as many tasks as the previous state of the art.
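One plausible reading of "dimensionality reduction provides a viable ordering", sketched here as an assumption rather than the paper's exact procedure: project each detected element's 2-D center onto the first principal component and sort along it.

import numpy as np
from sklearn.decomposition import PCA

def order_elements(centers: np.ndarray) -> np.ndarray:
    """centers: (n, 2) array of (x, y) element centers from a pixel parser."""
    scores = PCA(n_components=1).fit_transform(centers).ravel()
    return np.argsort(scores)   # element indices in a reading-like 1-D order

centers = np.array([[10, 5], [200, 8], [15, 120], [210, 118]], dtype=float)
print(order_elements(centers))  # a stable, roughly top-to-bottom ordering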
♻ ☆ NN-Copula-CD: A Copula-Guided Interpretable Neural Network for Change Detection in Heterogeneous Remote Sensing Images
Change detection (CD) in heterogeneous remote sensing images has been widely used for disaster monitoring and land-use management. In the past decade, the heterogeneous CD problem has significantly benefited from the development of deep neural networks (DNNs). However, purely data-driven DNNs behave like black boxes, and this lack of interpretability limits their trustworthiness and controllability in most practical CD applications. As a powerful knowledge-driven tool, copula theory performs well in modeling relationships among random variables. To enhance the interpretability of existing neural networks for CD, we propose a knowledge-data-driven heterogeneous CD method based on a copula-guided neural network, named NN-Copula-CD. In NN-Copula-CD, the mathematical characteristics of copulas are employed as loss functions to supervise a neural network in learning the dependence between bi-temporal heterogeneous superpixel pairs, and the changed regions are then identified via binary classification based on the degrees of dependence of all the superpixel pairs in the bi-temporal images. We conduct in-depth experiments on three datasets with heterogeneous images, where both quantitative and visual results demonstrate the effectiveness of the proposed NN-Copula-CD method.
comment: The full version of this work is submitted to IEEE TGRS

♻ ☆ LOLA -- An Open-Source Massively Multilingual Large Language Model
This paper presents LOLA, a massively multilingual large language model trained on more than 160 languages using a sparse Mixture-of-Experts Transformer architecture. Our architectural and implementation choices address the challenge of harnessing linguistic diversity while maintaining efficiency and avoiding the common pitfalls of multilinguality. Our analysis of the evaluation results shows competitive performance in natural language generation and understanding tasks. Additionally, we demonstrate how the learned expert-routing mechanism exploits implicit phylogenetic linguistic patterns to potentially alleviate the curse of multilinguality. We provide an in-depth look at the training process, an analysis of the datasets, and a balanced exploration of the model's strengths and limitations. As an open-source model, LOLA promotes reproducibility and serves as a robust foundation for future research. Our findings enable the development of compute-efficient multilingual models with strong, scalable performance across languages.
Multimedia 1

♻ ☆ A Simple Baseline with Single-encoder for Referring Image Segmentation
Referring image segmentation (RIS) requires dense vision-language interactions between visual pixels and textual words to segment objects based on a given description. However, the dual encoders commonly adopted in RIS, e.g., Swin Transformer and BERT (uni-modal encoders) or CLIP (a multi-modal dual-encoder), lack dense multi-modal interactions during pre-training, leading to a gap with the pixel-level RIS task. To bridge this gap, existing RIS methods often rely on multi-modal fusion modules that connect the two encoders, but this approach incurs high computational costs. In this paper, we present a novel RIS method with a single encoder, i.e., BEiT-3, maximizing the potential of shared self-attention across all framework components. This enables seamless interactions between the two modalities from input to final prediction, producing granularly aligned multi-modal features. Furthermore, we propose lightweight yet effective decoder modules, a Shared FPN and a Shared Mask Decoder, which contribute to the high efficiency of our model. Our simple baseline with a single encoder achieves outstanding performance on the RIS benchmark datasets while maintaining computational efficiency, compared to the most recent state-of-the-art methods based on dual encoders.
comment: arXiv pre-print

Computation and Language 68

☆ Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at Any Resolution
We present the Qwen2-VL Series, an advanced upgrade of the previous Qwen-VL models that redefines the conventional predetermined-resolution approach in visual processing. Qwen2-VL introduces the Naive Dynamic Resolution mechanism, which enables the model to dynamically process images of varying resolutions into different numbers of visual tokens. This approach allows the model to generate more efficient and accurate visual representations, closely aligning with human perceptual processes. The model also integrates Multimodal Rotary Position Embedding (M-RoPE), facilitating the effective fusion of positional information across text, images, and videos. We employ a unified paradigm for processing both images and videos, enhancing the model's visual perception capabilities. To explore the potential of large multimodal models, Qwen2-VL investigates the scaling laws for large vision-language models (LVLMs). By scaling both the model size (with versions at 2B, 8B, and 72B parameters) and the amount of training data, the Qwen2-VL Series achieves highly competitive performance. Notably, the Qwen2-VL-72B model achieves results comparable to leading models such as GPT-4o and Claude3.5-Sonnet across various multimodal benchmarks, outperforming other generalist models. Code is available at https://github.com/QwenLM/Qwen2-VL.
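A back-of-the-envelope sketch of what dynamic resolution implies for token budgets: the visual token count grows with image area instead of being fixed. The patch size of 14 and 2x2 token merging are illustrative assumptions, not figures quoted from the report.

def visual_token_count(width: int, height: int,
                       patch: int = 14, merge: int = 2) -> int:
    cols, rows = width // patch, height // patch   # ViT patch grid
    return (cols // merge) * (rows // merge)       # after spatial token merging

for w, h in [(224, 224), (448, 448), (1344, 756)]:
    print(f"{w}x{h} -> {visual_token_count(w, h)} visual tokens")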
comment: Code is available at https://github.com/QwenLM/Qwen2-VL

☆ Qwen2.5-Coder Technical Report
In this report, we introduce the Qwen2.5-Coder series, a significant upgrade from its predecessor, CodeQwen1.5. The series includes two models: Qwen2.5-Coder-1.5B and Qwen2.5-Coder-7B. As a code-specific model family, Qwen2.5-Coder is built upon the Qwen2.5 architecture and continually pretrained on a vast corpus of over 5.5 trillion tokens. Through meticulous data cleaning, scalable synthetic data generation, and balanced data mixing, Qwen2.5-Coder demonstrates impressive code generation capabilities while retaining general versatility. The models have been evaluated on a wide range of code-related tasks, achieving state-of-the-art (SOTA) performance across more than 10 benchmarks, including code generation, completion, reasoning, and repair, consistently outperforming even larger models. We believe that the release of the Qwen2.5-Coder series will not only push the boundaries of research in code intelligence but also, through its permissive licensing, encourage broader adoption by developers in real-world applications.
☆ To CoT or not to CoT? Chain-of-thought helps mainly on math and symbolic reasoning
Chain-of-thought (CoT) via prompting is the de facto method for eliciting reasoning capabilities from large language models (LLMs). But for what kinds of tasks is this extra "thinking" really helpful? To analyze this, we conducted a quantitative meta-analysis covering over 100 papers using CoT and ran our own evaluations of 20 datasets across 14 models. Our results show that CoT gives strong performance benefits primarily on tasks involving math or logic, with much smaller gains on other types of tasks. On MMLU, directly generating the answer without CoT leads to almost identical accuracy as CoT unless the question or the model's response contains an equals sign, indicating symbolic operations and reasoning. Following this finding, we analyze the behavior of CoT on these problems by separating planning and execution and by comparing against tool-augmented LLMs. Much of CoT's gain comes from improving symbolic execution, yet CoT underperforms relative to using a symbolic solver. Our results indicate that CoT can be applied selectively, maintaining performance while saving inference costs. Furthermore, they suggest a need to move beyond prompt-based CoT to new paradigms that better leverage intermediate computation across the whole range of LLM applications.
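A minimal sketch of the selective application the abstract argues for: spend CoT tokens only when the question looks symbolic, here flagged by the equals-sign heuristic the authors identify. `query_llm` is a hypothetical stand-in for any chat-completion client.

def answer(question: str, query_llm) -> str:
    if "=" in question:  # heuristic proxy for math/symbolic reasoning
        return query_llm(f"{question}\nLet's think step by step.")
    return query_llm(f"{question}\nReply with the final answer only.")

# answer("Solve 3x + 2 = 11 for x.", query_llm)    -> routed through CoT
# answer("Who wrote Wuthering Heights?", query_llm) -> answered directly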
☆ Finetuning Language Models to Emit Linguistic Expressions of Uncertainty
Large language models (LLMs) are increasingly employed in information-seeking and decision-making tasks. Despite their broad utility, LLMs tend to generate information that conflicts with real-world facts, and their persuasive style can make these inaccuracies appear confident and convincing. As a result, end-users struggle to consistently align the confidence expressed by LLMs with the accuracy of their predictions, often leading to either blind trust in all outputs or a complete disregard for their reliability. In this work, we explore supervised finetuning on uncertainty-augmented predictions as a method to develop models that produce linguistic expressions of uncertainty. Specifically, we measure the calibration of pre-trained models and then fine-tune language models to generate calibrated linguistic expressions of uncertainty. Through experiments on various question-answering datasets, we demonstrate that LLMs are well-calibrated in assessing their predictions, and supervised finetuning based on the model's own confidence leads to well-calibrated expressions of uncertainty, particularly for single-claim answers.
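A sketch of the data-construction step such finetuning needs: attach a verbal hedge to each training answer according to the model's measured confidence. The bin boundaries and phrases are illustrative assumptions.

def verbalize(answer: str, confidence: float) -> str:
    bins = [(0.9, "I'm almost certain"), (0.7, "I'm fairly confident"),
            (0.5, "I believe"), (0.0, "I'm unsure, but possibly")]
    phrase = next(p for floor, p in bins if confidence >= floor)
    return f"{phrase} the answer is {answer}."

print(verbalize("Paris", 0.95))  # I'm almost certain the answer is Paris.
print(verbalize("Lyon", 0.42))   # I'm unsure, but possibly the answer is Lyon.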
☆ You Only Read Once (YORO): Learning to Internalize Database Knowledge for Text-to-SQL
While significant progress has been made on the text-to-SQL task, recent solutions repeatedly encode the same database schema for every question, resulting in unnecessarily high inference costs and often overlooking crucial database knowledge. To address these issues, we propose You Only Read Once (YORO), a novel paradigm that directly internalizes database knowledge into the parametric knowledge of a text-to-SQL model during training and eliminates the need for schema encoding during inference. YORO significantly reduces the input token length by 66%-98%. Despite its shorter inputs, our empirical results demonstrate YORO's competitive performance with traditional systems on three benchmarks, as well as its significant outperformance on large databases. Furthermore, YORO excels at handling questions with challenging value retrievals, such as abbreviations.
☆ MAgICoRe: Multi-Agent, Iterative, Coarse-to-Fine Refinement for Reasoning
Large language model (LLM) reasoning can be improved using test-time aggregation strategies, i.e., generating multiple samples and voting among them. While these improve performance, they often reach a saturation point. Refinement offers an alternative by using LLM-generated feedback to improve solution quality. However, refinement introduces three key challenges: (1) Excessive refinement: uniformly refining all instances can over-correct and reduce overall performance. (2) Inability to localize and address errors: LLMs have a limited ability to self-correct and struggle to identify and correct their own mistakes. (3) Insufficient refinement: deciding how many iterations of refinement are needed is non-trivial, and stopping too soon could leave errors unaddressed. To tackle these issues, we propose MAgICoRe, which avoids excessive refinement by categorizing problem difficulty as easy or hard, solving easy problems with coarse-grained aggregation and hard ones with fine-grained and iterative multi-agent refinement. To improve error localization, we incorporate external step-wise reward model (RM) scores. Moreover, to ensure effective refinement, we employ a multi-agent loop with three agents: the Solver, the Reviewer (which generates targeted feedback based on step-wise RM scores), and the Refiner (which incorporates feedback). To ensure sufficient refinement, we re-evaluate updated solutions, iteratively initiating further rounds of refinement. We evaluate MAgICoRe on Llama-3-8B and GPT-3.5 and show its effectiveness across five math datasets. Even one iteration of MAgICoRe beats Self-Consistency by 3.4%, Best-of-k by 3.2%, and Self-Refine by 4.0% while using less than half the samples. Unlike iterative refinement with baselines, MAgICoRe continues to improve with more iterations. Finally, our ablations highlight the importance of MAgICoRe's RMs and multi-agent communication.
comment: 22 pages, code: https://github.com/dinobby/MAgICoRe

☆ GRIN: GRadient-INformed MoE
Mixture-of-Experts (MoE) models scale more effectively than dense models due to sparse computation through expert routing, selectively activating only a small subset of expert modules. However, sparse computation challenges traditional training practices, as discrete expert routing hinders standard backpropagation and thus gradient-based optimization, which are the cornerstone of deep learning. To better pursue the scaling power of MoE, we introduce GRIN (GRadient-INformed MoE training), which incorporates sparse gradient estimation for expert routing and configures model parallelism to avoid token dropping. Applying GRIN to autoregressive language modeling, we develop a top-2 16$\times$3.8B MoE model. Our model, with only 6.6B activated parameters, outperforms a 7B dense model and matches the performance of a 14B dense model trained on the same data. Extensive evaluations across diverse tasks demonstrate the potential of GRIN to significantly enhance MoE efficacy, achieving 79.4 on MMLU, 83.7 on HellaSwag, 74.4 on HumanEval, and 58.9 on MATH.
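A baseline sketch of why routing gradients are the crux: top-2 selection is discrete, and the common workaround shown here lets gradients reach the router only through the gate probabilities of the chosen experts. GRIN's sparse gradient estimator is more sophisticated; this block illustrates only the mechanism it improves on.

import torch
import torch.nn as nn

class Top2MoE(nn.Module):
    def __init__(self, dim: int, n_experts: int = 16):
        super().__init__()
        self.gate = nn.Linear(dim, n_experts, bias=False)
        self.experts = nn.ModuleList(nn.Linear(dim, dim) for _ in range(n_experts))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        probs = torch.softmax(self.gate(x), dim=-1)   # (tokens, n_experts)
        top_p, top_i = probs.topk(2, dim=-1)          # discrete expert choice
        out = torch.zeros_like(x)
        for slot in range(2):
            for e, expert in enumerate(self.experts):
                mask = top_i[:, slot] == e
                if mask.any():  # weighting by the gate prob keeps it trainable
                    out[mask] += top_p[mask, slot, None] * expert(x[mask])
        return out

x = torch.randn(8, 64, requires_grad=True)
Top2MoE(64)(x).sum().backward()   # gradients reach the gate via top_p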
comment: 58 pages

☆ BERT-VBD: Vietnamese Multi-Document Summarization Framework
In tackling the challenge of Multi-Document Summarization (MDS), numerous methods have been proposed, spanning both extractive and abstractive summarization techniques. However, each approach has its own limitations, making it less effective to rely solely on either one. An emerging and promising strategy involves a synergistic fusion of extractive and abstractive summarization methods. Despite the plethora of studies in this domain, research on the combined methodology remains scarce, particularly in the context of Vietnamese language processing. This paper presents a novel Vietnamese MDS framework leveraging a two-component pipeline architecture that integrates extractive and abstractive techniques. The first component employs an extractive approach to identify key sentences within each document. This is achieved by a modification of the pre-trained BERT network, which derives semantically meaningful phrase embeddings using siamese and triplet network structures. The second component utilizes the VBD-LLaMA2-7B-50b model for abstractive summarization, ultimately generating the final summary document. Our proposed framework demonstrates strong performance, attaining ROUGE-2 scores of 39.6% on the VN-MDS dataset and outperforming the state-of-the-art baselines.
comment: 10 pages

☆ Linguini: A benchmark for language-agnostic linguistic reasoning
We propose a new benchmark to measure a language model's linguistic reasoning skills without relying on pre-existing language-specific knowledge. The test covers 894 questions grouped in 160 problems across 75 (mostly) extremely low-resource languages, extracted from the International Linguistic Olympiad corpus. To attain high accuracy on this benchmark, models do not need prior knowledge of the tested language, as all the information needed to solve the linguistic puzzle is presented in the context. We find that, while all analyzed models rank below 25% accuracy, there is a significant gap between open and closed models, with the best-performing proprietary model at 24.05% and the best-performing open model at 8.84%.
☆ Qwen2.5-Math Technical Report: Toward Mathematical Expert Model via Self-Improvement
In this report, we present a series of math-specific large language models: Qwen2.5-Math and Qwen2.5-Math-Instruct-1.5B/7B/72B. The core innovation of the Qwen2.5 series lies in integrating the philosophy of self-improvement throughout the entire pipeline, from pre-training and post-training to inference: (1) During the pre-training phase, Qwen2-Math-Instruct is utilized to generate large-scale, high-quality mathematical data. (2) In the post-training phase, we develop a reward model (RM) by conducting massive sampling from Qwen2-Math-Instruct. This RM is then applied to the iterative evolution of data in supervised fine-tuning (SFT). With a stronger SFT model, it is possible to iteratively train and update the RM, which in turn guides the next round of SFT data iteration. On the final SFT model, we employ the ultimate RM for reinforcement learning, resulting in Qwen2.5-Math-Instruct. (3) Furthermore, during the inference stage, the RM is used to guide sampling, optimizing the model's performance.

Qwen2.5-Math-Instruct supports both Chinese and English, and possesses advanced mathematical reasoning capabilities, including Chain-of-Thought (CoT) and Tool-Integrated Reasoning (TIR). We evaluate our models on 10 mathematics datasets in both English and Chinese, such as GSM8K, MATH, GaoKao, AMC23, and AIME24, covering a range of difficulties from grade-school level to math competition problems.
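A compact sketch of the RM-guided inference in step (3): draw several candidate solutions and keep the one the reward model scores highest. `sample_solution` and `reward_model` are hypothetical stand-ins for the actual models.

def best_of_n(problem: str, sample_solution, reward_model, n: int = 8) -> str:
    candidates = [sample_solution(problem) for _ in range(n)]  # i.i.d. samples
    return max(candidates, key=lambda sol: reward_model(problem, sol))

# answer = best_of_n("Solve x^2 - 5x + 6 = 0.", sample_solution, reward_model)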
☆ Low Frame-rate Speech Codec: a Codec Designed for Fast High-quality Speech LLM Training and Inference ICASSP 2025
Large language models (LLMs) have significantly advanced audio processing through audio codecs that convert audio into discrete tokens, enabling the application of language modeling techniques to audio data. However, audio codecs often operate at high frame rates, resulting in slow training and inference, especially for autoregressive models. To address this challenge, we present the Low Frame-rate Speech Codec (LFSC): a neural audio codec that leverages finite scalar quantization and adversarial training with large speech language models to achieve high-quality audio compression with a 1.89 kbps bitrate and 21.5 frames per second. We demonstrate that our novel codec can make the inference of LLM-based text-to-speech models around three times faster while improving intelligibility and producing quality comparable to previous models.
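A minimal sketch of finite scalar quantization, the quantizer named above: each latent dimension is bounded and rounded to a small number of levels, with a straight-through estimator keeping training differentiable. The level count is illustrative, not the codec's configuration.

import torch

def fsq(z: torch.Tensor, levels: int = 8) -> torch.Tensor:
    half = (levels - 1) / 2
    bounded = torch.tanh(z) * half                    # squash into [-half, half]
    quantized = torch.round(bounded)                  # snap to the integer grid
    return bounded + (quantized - bounded).detach()   # straight-through gradient

z = torch.randn(2, 4, requires_grad=True)
fsq(z).sum().backward()                               # flows despite rounding
print(z.grad is not None)                             # True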
comment: Submitted to ICASSP 2025

☆ Measuring Human and AI Values based on Generative Psychometrics with Large Language Models
Human values and their measurement are a long-standing interdisciplinary inquiry. Recent advances in AI have sparked renewed interest in this area, with large language models (LLMs) emerging as both tools and subjects of value measurement. This work introduces Generative Psychometrics for Values (GPV), an LLM-based, data-driven value measurement paradigm, theoretically grounded in text-revealed selective perceptions. We begin by fine-tuning an LLM for accurate perception-level value measurement and verifying the capability of LLMs to parse texts into perceptions, forming the core of the GPV pipeline. Applying GPV to human-authored blogs, we demonstrate its stability, validity, and superiority over prior psychological tools. Then, extending GPV to LLM value measurement, we advance the current art with (1) a psychometric methodology that measures LLM values based on their scalable and free-form outputs, enabling context-specific measurement; (2) a comparative analysis of measurement paradigms, indicating response biases of prior methods; and (3) an attempt to bridge LLM values and their safety, revealing the predictive power of different value systems and the impacts of various values on LLM safety. Through interdisciplinary efforts, we aim to leverage AI for next-generation psychometrics and psychometrics for value-aligned AI.
☆ PARAPHRASUS: A Comprehensive Benchmark for Evaluating Paraphrase Detection Models
The task of determining whether two texts are paraphrases has long been a challenge in NLP. However, the prevailing notion of paraphrase is often quite simplistic, offering only a limited view of the vast spectrum of paraphrase phenomena. Indeed, we find that evaluating models on a single paraphrase dataset can leave uncertainty about their true semantic understanding. To alleviate this, we release PARAPHRASUS, a benchmark designed for multi-dimensional assessment of paraphrase detection models and finer model selection. We find that paraphrase detection models under a fine-grained evaluation lens exhibit trade-offs that cannot be captured through a single classification dataset.
☆ Dual-Layer Training and Decoding of Large Language Model with Simultaneously Thinking and Speaking
Large language models can reasonably understand and generate human expressions but may lack thorough thinking and reasoning mechanisms. Recently there have been several studies that enhance the thinking ability of language models, but most of them are not data-driven or training-based. In this paper, motivated by cognitive mechanisms in the natural world, we design a novel model architecture called TaS, which first considers the thoughts and then expresses the response based upon the query. We design several pipelines to annotate or generate thought contents from prompt-response samples, then add language heads in a middle layer that behaves as the thinking layer. We train the language model with the thoughts-augmented data and successfully let the thinking layer automatically generate reasonable thoughts and finally output more reasonable responses. Both qualitative examples and quantitative results validate the effectiveness and performance of TaS. Our code is available at https://anonymous.4open.science/r/TadE.
comment: 9 pages, 5 figures

☆ ASR Benchmarking: Need for a More Representative Conversational Dataset
Automatic Speech Recognition (ASR) systems have achieved remarkable performance on widely used benchmarks such as LibriSpeech and Fleurs. However, these benchmarks do not adequately reflect the complexities of real-world conversational environments, where speech is often unstructured and contains disfluencies such as pauses, interruptions, and diverse accents. In this study, we introduce a multilingual conversational dataset, derived from TalkBank, consisting of unstructured phone conversations between adults. Our results show a significant performance drop across various state-of-the-art ASR models when tested in conversational settings. Furthermore, we observe a correlation between Word Error Rate and the presence of speech disfluencies, highlighting the critical need for more realistic, conversational ASR benchmarks.
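A sketch of the analysis behind the reported correlation: per-utterance WER from a standard edit distance, correlated against a simple disfluency count. The filler-word list is an illustrative assumption.

import numpy as np

def wer(ref: str, hyp: str) -> float:
    r, h = ref.split(), hyp.split()
    d = np.zeros((len(r) + 1, len(h) + 1), dtype=int)
    d[:, 0], d[0, :] = np.arange(len(r) + 1), np.arange(len(h) + 1)
    for i in range(1, len(r) + 1):          # classic Levenshtein recurrence
        for j in range(1, len(h) + 1):
            d[i, j] = min(d[i-1, j] + 1, d[i, j-1] + 1,
                          d[i-1, j-1] + (r[i-1] != h[j-1]))
    return d[len(r), len(h)] / max(len(r), 1)

def disfluencies(ref: str) -> int:
    return sum(w in {"uh", "um", "er", "hmm"} for w in ref.lower().split())

refs = ["um i uh think so", "the meeting starts at nine"]
hyps = ["i think so", "the meeting starts at nine"]
print(np.corrcoef([wer(r, h) for r, h in zip(refs, hyps)],
                  [disfluencies(r) for r in refs])[0, 1])  # toy correlation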
☆ Sampling Latent Material-Property Information From LLM-Derived Embedding Representations
Vector embeddings derived from large language models (LLMs) show promise in capturing latent information from the literature. Interestingly, these can be integrated into material embeddings, potentially useful for data-driven predictions of materials properties. We investigate the extent to which LLM-derived vectors capture the desired information and their potential to provide insights into material properties without additional training. Our findings indicate that, although LLMs can be used to generate representations reflecting certain property information, extracting the embeddings requires identifying the optimal contextual clues and appropriate comparators. Despite this restriction, it appears that LLMs still have the potential to be useful in generating meaningful materials-science representations.
comment: 10 pages, 7 figures

☆ Efficacy of Synthetic Data as a Benchmark
Large language models (LLMs) have enabled a range of applications in zero-shot and few-shot learning settings, including the generation of synthetic datasets for training and testing. However, to reliably use these synthetic datasets, it is essential to understand how representative they are of real-world data. We investigate this by assessing the effectiveness of generating synthetic data through LLMs and using it as a benchmark for various NLP tasks. Our experiments across six datasets and three different tasks show that while synthetic data can effectively capture the performance of various methods for simpler tasks, such as intent classification, it falls short for more complex tasks like named entity recognition. Additionally, we propose a new metric called the bias factor, which evaluates the biases introduced when the same LLM is used both to generate benchmarking data and to perform the tasks. We find that smaller LLMs exhibit biases towards their own generated data, whereas larger models do not. Overall, our findings suggest that the effectiveness of synthetic data as a benchmark varies depending on the task, and that practitioners should rely on data generated from multiple larger models whenever possible.
☆ LLMs in Education: Novel Perspectives, Challenges, and Opportunities COLING 2025
The role of large language models (LLMs) in education is an increasing area of interest today, considering the new opportunities they offer for teaching, learning, and assessment. This cutting-edge tutorial provides an overview of the educational applications of NLP and the impact that the recent advances in LLMs have had on this field. We will discuss the key challenges and opportunities presented by LLMs, grounding them in the context of four major educational applications: reading, writing, and speaking skills, and intelligent tutoring systems (ITS). This COLING 2025 tutorial is designed for researchers and practitioners interested in the educational applications of NLP and the role LLMs have to play in this area. It is the first of its kind to address this timely topic.
comment: COLING 2025 Tutorial

☆ LLMs + Persona-Plug = Personalized LLMs
Personalization plays a critical role in numerous language tasks and applications, since users with the same requirements may prefer diverse outputs based on their individual interests. This has led to the development of various personalized approaches aimed at adapting large language models (LLMs) to generate customized outputs aligned with user preferences. Some of them involve fine-tuning a unique personalized LLM for each user, which is too expensive for widespread application. Alternative approaches introduce personalization information in a plug-and-play manner by retrieving the user's relevant historical texts as demonstrations. However, this retrieval-based strategy may break the continuity of the user history and fail to capture the user's overall styles and patterns, leading to sub-optimal performance. To address these challenges, we propose a novel personalized LLM model, Persona-Plug. It constructs a user-specific embedding for each individual by modeling all of their historical contexts through a lightweight plug-in user embedder module. By attaching this embedding to the task input, LLMs can better understand and capture user habits and preferences, thereby producing more personalized outputs without tuning their own parameters. Extensive experiments on various tasks in the language model personalization (LaMP) benchmark demonstrate that the proposed model significantly outperforms existing personalized LLM approaches.
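A hedged sketch of the plug-and-play idea: compress a user's entire history into one embedding and prepend it to the frozen LLM's input embeddings as a soft token. The pooling choice and dimensions are illustrative assumptions.

import torch
import torch.nn as nn

class UserEmbedder(nn.Module):
    """Maps all of a user's history encodings to a single soft-prompt vector."""
    def __init__(self, text_dim: int, llm_dim: int):
        super().__init__()
        self.proj = nn.Linear(text_dim, llm_dim)

    def forward(self, history_embs: torch.Tensor) -> torch.Tensor:
        pooled = history_embs.mean(dim=0)    # (text_dim,) lightweight pooling
        return self.proj(pooled)             # (llm_dim,) one user token

embedder = UserEmbedder(text_dim=384, llm_dim=4096)
user_token = embedder(torch.randn(57, 384))  # 57 history documents -> 1 vector
# prepend: inputs = torch.cat([user_token[None, None, :], task_embeds], dim=1)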
☆ DocMamba: Efficient Document Pre-training with State Space Model
In recent years, visually-rich document understanding has attracted increasing attention. Transformer-based pre-trained models have become the mainstream approach, yielding significant performance gains in this field. However, the self-attention mechanism's quadratic computational complexity hinders their efficiency and ability to process long documents. In this paper, we present DocMamba, a novel framework based on the state space model. It is designed to reduce computational complexity to linear while preserving global modeling capabilities. To further enhance its effectiveness in document processing, we introduce the Segment-First Bidirectional Scan (SFBS) to capture contiguous semantic information. Experimental results demonstrate that DocMamba achieves new state-of-the-art results on downstream datasets such as FUNSD, CORD, and SROIE, while significantly improving speed and reducing memory usage. Notably, experiments on HRDoc confirm DocMamba's potential for length extrapolation. The code will be available online.
☆ Retrieve, Annotate, Evaluate, Repeat: Leveraging Multimodal LLMs for Large-Scale Product Retrieval Evaluation
Evaluating production-level retrieval systems at scale is a crucial yet challenging task due to the limited availability of a large pool of well-trained human annotators. Large Language Models (LLMs) have the potential to address this scaling issue and offer a viable alternative to humans for the bulk of annotation tasks. In this paper, we propose a framework for assessing the product search engines in a large-scale e-commerce setting, leveraging Multimodal LLMs for (i) generating tailored annotation guidelines for individual queries, and (ii) conducting the subsequent annotation task. Our method, validated through deployment on a large e-commerce platform, demonstrates comparable quality to human annotations, significantly reduces time and cost, facilitates rapid problem discovery, and provides an effective solution for production-level quality control at scale.
comment: 13 pages, 5 figures, 4 Tables

☆ MEOW: MEMOry Supervised LLM Unlearning Via Inverted Facts
Large language models (LLMs) can memorize sensitive information, raising concerns about potential misuse. LLM unlearning, a post-hoc approach to remove this information from trained LLMs, offers a promising solution to mitigate these risks. However, previous practices face three key challenges: (1) Utility: successful unlearning often causes catastrophic collapse on unrelated tasks. (2) Efficiency: many methods either involve adding similarly sized models, which slows down unlearning or inference, or require retain data that are difficult to obtain. (3) Robustness: even effective methods may still leak data via extraction techniques. To address these challenges, we propose MEOW, a simple yet effective gradient-descent-based unlearning method. Specifically, we use an offline LLM to generate a set of inverted facts. Then, we design a new metric, MEMO, to quantify memorization in LLMs. Finally, based on the signals provided by MEMO, we select the most appropriate set of inverted facts and finetune the model on them. We evaluate MEOW on the commonly used unlearning benchmark ToFU with Llama2-7B-Chat and Phi-1.5B, and test it on both NLU and NLG tasks. Results demonstrate a significant improvement of MEOW in forget quality without substantial loss in model utility. Meanwhile, MEOW does not exhibit significant degradation in NLU or NLG capabilities, and there is even a slight improvement in NLU performance.
☆ Extract-and-Abstract: Unifying Extractive and Abstractive Summarization within Single Encoder-Decoder Framework
Extract-then-Abstract is a naturally coherent paradigm for conducting abstractive summarization with the help of salient information identified by an extractive model. Previous works that adopt this paradigm train the extractor and abstractor separately and introduce extra parameters to highlight the extracted salient content for the abstractor, which results in error accumulation and additional training costs. In this paper, we first introduce a parameter-free highlight method into the encoder-decoder framework: replacing the encoder attention mask with a saliency mask in the cross-attention module to force the decoder to focus only on salient parts of the input. A preliminary analysis compares different highlight methods, demonstrating the effectiveness of our saliency mask. We further propose the novel extract-and-abstract paradigm, ExtAbs, which jointly and seamlessly performs extractive and abstractive summarization tasks within a single encoder-decoder model to reduce error accumulation. In ExtAbs, the vanilla encoder is augmented to extract salient content, and the vanilla decoder is modified with the proposed saliency mask to generate summaries. Built upon BART and PEGASUS, experiments on three datasets show that ExtAbs can achieve superior performance over baselines on the extractive task and performs comparably to, or even better than, the vanilla models on the abstractive task.
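A sketch of the parameter-free highlight in isolation: a saliency mask replaces the usual cross-attention padding mask so the decoder can only attend to tokens of extracted salient sentences. Tensor shapes follow PyTorch's attention API; the BART/PEGASUS integration details are omitted.

import torch
import torch.nn.functional as F

def salient_cross_attention(q, k, v, salient: torch.Tensor):
    """salient: (batch, src_len) bool, True where an encoder token is salient."""
    bias = torch.zeros(salient.shape, dtype=q.dtype)
    bias = bias.masked_fill(~salient, float("-inf"))  # hide non-salient tokens
    return F.scaled_dot_product_attention(q, k, v,
                                          attn_mask=bias[:, None, None, :])

q = torch.randn(1, 8, 5, 64)          # (batch, heads, target_len, head_dim)
k = v = torch.randn(1, 8, 12, 64)     # 12 encoder tokens
salient = torch.zeros(1, 12, dtype=torch.bool)
salient[0, :4] = True                 # decoder sees only the first 4 tokens
out = salient_cross_attention(q, k, v, salient)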
☆ The Factuality of Large Language Models in the Legal Domain CIKM 2024
This paper investigates the factuality of large language models (LLMs) as knowledge bases in the legal domain, in a realistic usage scenario: we allow for acceptable variations in the answer, and let the model abstain from answering when uncertain. First, we design a dataset of diverse factual questions about case law and legislation. We then use the dataset to evaluate several LLMs under different evaluation methods, including exact, alias, and fuzzy matching. Our results show that the performance improves significantly under the alias and fuzzy matching methods. Further, we explore the impact of abstaining and in-context examples, finding that both strategies enhance precision. Finally, we demonstrate that additional pre-training on legal documents, as seen with SaulLM, further improves factual precision from 63% to 81%.
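A standard-library sketch of the three matching regimes compared above; the alias table and the fuzzy threshold are illustrative assumptions.

from difflib import SequenceMatcher

ALIASES = {"court of justice of the european union": {"cjeu",
           "european court of justice"}}

def exact(pred: str, gold: str) -> bool:
    return pred.strip().lower() == gold.strip().lower()

def alias(pred: str, gold: str) -> bool:
    g = gold.strip().lower()
    return pred.strip().lower() in ALIASES.get(g, set()) | {g}

def fuzzy(pred: str, gold: str, threshold: float = 0.8) -> bool:
    return SequenceMatcher(None, pred.lower(), gold.lower()).ratio() >= threshold

print(fuzzy("the European Court of Justice", "European Court of Justice"))  # True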
comment: CIKM 2024, short paper

☆ Enabling Real-Time Conversations with Minimal Training Costs
Large language models (LLMs) have demonstrated the ability to improve human efficiency through conversational interactions. Conventional LLM-powered dialogue systems, operating on a turn-based paradigm, preclude real-time interaction during response generation. To address this limitation, researchers have proposed duplex models. These models can dynamically adapt to user input, facilitating real-time interactive feedback. However, these methods typically require substantial computational resources to acquire this ability. To reduce the overhead, this paper presents a new duplex decoding approach that enhances LLMs with duplex ability while requiring minimal additional training. Specifically, our method employs parallel decoding of queries and responses in conversations, effectively implementing a channel-division-multiplexing decoding strategy. Experimental results indicate that our proposed method significantly enhances the naturalness and human-likeness of user-AI interactions with minimal training costs.
comment: 7 pages, 6 figures, 1 table

☆ Revealing the Challenge of Detecting Character Knowledge Errors in LLM Role-Playing
Large language model (LLM) role-playing has gained widespread attention, and authentic character knowledge is crucial for constructing realistic LLM role-playing agents. However, existing works usually overlook LLMs' ability to detect characters' known knowledge errors (KKE) and unknown knowledge errors (UKE) while playing roles, which leads to low-quality automatic construction of character-training corpora. In this paper, we propose a probing dataset to evaluate LLMs' ability to detect errors in KKE and UKE. The results indicate that even the latest LLMs struggle to effectively detect these two types of errors, especially when it comes to familiar knowledge. We experimented with various reasoning strategies and propose an agent-based reasoning method, Self-Recollection and Self-Doubt (S2RD), to further explore the potential for improving error detection capabilities. Experiments show that our method effectively improves the LLMs' ability to detect character knowledge errors, though this remains an issue that requires ongoing attention.
comment: 22 pages, 14 figures

☆ TART: An Open-Source Tool-Augmented Framework for Explainable Table-based Reasoning
Current Large Language Models (LLMs) exhibit a limited ability to understand table structures and to apply precise numerical reasoning, which is crucial for tasks such as table question answering (TQA) and table-based fact verification (TFV). To address these challenges, we introduce our Tool-Augmented Reasoning framework for Tables (TART), which integrates LLMs with specialized tools. TART contains three key components: a table formatter to ensure accurate data representation, a tool maker to develop specific computational tools, and an explanation generator to maintain explainability. We also present the TOOLTAB dataset, a new benchmark designed specifically for training LLMs in table-tool integration. Our experiments indicate that TART achieves substantial improvements over existing methods (e.g., Chain-of-Thought) by improving both the precision of data processing and the clarity of the reasoning process. Notably, TART paired with CodeLlama achieves 90.0% of the accuracy of the closed-source LLM GPT-3.5-turbo, highlighting its robustness in diverse real-world scenarios. All the code and data are available at https://github.com/XinyuanLu00/TART.
comment: technical report

☆ From Lists to Emojis: How Format Bias Affects Model Alignment
In this paper, we study format biases in reinforcement learning from human feedback (RLHF). We observe that many widely-used preference models, including human evaluators, GPT-4, and top-ranking models on the RewardBench benchmark, exhibit strong biases towards specific format patterns, such as lists, links, bold text, and emojis. Furthermore, large language models (LLMs) can exploit these biases to achieve higher rankings on popular benchmarks like AlpacaEval and LMSYS Chatbot Arena. One notable example is verbosity bias, where current preference models favor longer responses that appear more comprehensive, even when their quality is equal to or lower than that of shorter, competing responses. However, format biases beyond verbosity remain largely underexplored in the literature. In this work, we extend the study of biases in preference learning beyond the commonly recognized length bias, offering a comprehensive analysis of a wider range of format biases. Additionally, we show that with a small amount of biased data (less than 1%), we can inject significant bias into the reward model. Moreover, these format biases can easily be exploited by downstream alignment algorithms, such as best-of-n sampling and online iterative DPO, since it is usually easier to manipulate the format than to improve the quality of responses. Our findings emphasize the need to disentangle format and content, both for designing alignment algorithms and for evaluating models.
comment: Work in progress

☆ Harnessing LLMs for API Interactions: A Framework for Classification and Synthetic Data Generation
As Large Language Models (LLMs) advance in natural language processing, there is growing interest in leveraging their capabilities to simplify software interactions. In this paper, we propose a novel system that integrates LLMs for both classifying natural language inputs into corresponding API calls and automating the creation of sample datasets tailored to specific API functions. By classifying natural language commands, our system allows users to invoke complex software functionalities through simple inputs, improving interaction efficiency and lowering the barrier to software utilization. Our dataset generation approach also enables the efficient and systematic evaluation of different LLMs in classifying API calls, offering a practical tool for developers or business owners to assess the suitability of LLMs for customized API management. We conduct experiments on several prominent LLMs using generated sample datasets for various API functions. The results show that GPT-4 achieves a high classification accuracy of 0.996, while LLaMA-3-8B performs much worse at 0.759. These findings highlight the potential of LLMs to transform API management and validate the effectiveness of our system in guiding model testing and selection across diverse applications.
☆ FLARE: Fusing Language Models and Collaborative Architectures for Recommender Enhancement
Hybrid recommender systems, combining item IDs and textual descriptions, offer potential for improved accuracy. However, previous work has largely focused on smaller datasets and model architectures. This paper introduces Flare (Fusing Language models and collaborative Architectures for Recommender Enhancement), a novel hybrid recommender that integrates a language model (mT5) with a collaborative filtering model (Bert4Rec) using a Perceiver network. This architecture allows Flare to effectively combine collaborative and content information for enhanced recommendations.

We conduct a two-stage evaluation, first assessing Flare's performance against established baselines on smaller datasets, where it demonstrates competitive accuracy. Subsequently, we evaluate Flare on a larger, more realistic dataset with a significantly larger item vocabulary, introducing new baselines for this setting. Finally, we showcase Flare's inherent ability to support critiquing, enabling users to provide feedback and refine recommendations. We further leverage critiquing as an evaluation method to assess the model's language understanding and its transferability to the recommendation task.
☆ Enhancing Complex Formula Recognition with Hierarchical Detail-Focused Network ICASSP 2025
Hierarchical and complex Mathematical Expression Recognition (MER) is challenging due to multiple possible interpretations of a formula, complicating both parsing and evaluation. In this paper, we introduce the Hierarchical Detail-Focused Recognition dataset (HDR), the first dataset specifically designed to address these issues. It consists of a large-scale training set, HDR-100M, offering unprecedented scale and diversity with one hundred million training instances, and a test set, HDR-Test, that includes multiple interpretations of complex hierarchical formulas for comprehensive model performance evaluation. Additionally, the parsing of complex formulas often suffers from errors in fine-grained details. To address this, we propose the Hierarchical Detail-Focused Recognition Network (HDNet), an innovative framework that incorporates a hierarchical sub-formula module focused on the precise handling of formula details, thereby significantly enhancing MER performance. Experimental results demonstrate that HDNet outperforms existing MER models across various datasets.
comment: Submitted to the 2025 IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP 2025)

☆ RUIE: Retrieval-based Unified Information Extraction using Large Language Model
Unified information extraction (UIE) aims to complete all information extraction tasks using a single model or framework. While previous work has primarily focused on instruction-tuning large language models (LLMs) with constructed datasets, these methods require significant computational resources and struggle to generalize to unseen tasks. To address these limitations, we propose RUIE (Retrieval-based Unified Information Extraction), a framework that leverages in-context learning to enable rapid generalization while reducing computational costs. The key challenge in RUIE is selecting the most beneficial demonstrations for LLMs to effectively handle diverse IE tasks. To achieve this, we integrate LLM preferences for ranking candidate demonstrations and design a keyword-enhanced reward model to capture fine-grained relationships between queries and demonstrations. We then train a bi-encoder retriever for UIE through contrastive learning and knowledge distillation. To the best of our knowledge, RUIE is the first trainable retrieval framework for UIE. Experimental results on 8 held-out datasets demonstrate RUIE's effectiveness in generalizing to unseen tasks, with average F1-score improvements of 19.22 and 3.13 compared to instruction-tuning methods and other retrievers, respectively. Further analysis confirms RUIE's adaptability to LLMs of varying sizes and the importance of its key components.
comment: 14 pages, 3 figures

☆ BanStereoSet: A Dataset to Measure Stereotypical Social Biases in LLMs for Bangla
This study presents BanStereoSet, a dataset designed to evaluate stereotypical social biases in multilingual LLMs for the Bangla language. In an effort to extend the focus of bias research beyond English-centric datasets, we have localized the content from the StereoSet, IndiBias, and Kamruzzaman et al.'s datasets, producing a resource tailored to capture biases prevalent within the Bangla-speaking community. Our BanStereoSet dataset consists of 1,194 sentences spanning 9 categories of bias: race, profession, gender, ageism, beauty, beauty in profession, region, caste, and religion. This dataset not only serves as a crucial tool for measuring bias in multilingual LLMs but also facilitates the exploration of stereotypical bias across different social categories, potentially guiding the development of more equitable language technologies in Bangladeshi contexts. Our analysis of several language models using this dataset indicates significant biases, reinforcing the necessity for culturally and linguistically adapted datasets to develop more equitable language technologies.
☆ "A Woman is More Culturally Knowledgeable than A Man?": The Effect of Personas on Cultural Norm Interpretation in LLMs
As the deployment of large language models (LLMs) expands, there is an increasing demand for personalized LLMs. One method to personalize and guide the outputs of these models is by assigning a persona, a role that describes the expected behavior of the LLM (e.g., a man, a woman, an engineer). This study investigates whether an LLM's understanding of social norms varies across assigned personas. Ideally, the perception of a social norm should remain consistent regardless of the persona, since the acceptability of a social norm should be determined by the region the norm originates from, rather than by individual characteristics such as gender, body size, or race; a norm is universal within its cultural context. In our research, we tested 36 distinct personas from 12 sociodemographic categories (e.g., age, gender, beauty) across four different LLMs. We find that LLMs' cultural norm interpretation varies based on the persona used, and that the norm interpretation also varies within a sociodemographic category (e.g., a fat person and a thin person in the physical-appearance group), where an LLM with the more socially desirable persona (e.g., a thin person) interprets social norms more accurately than with the less socially desirable persona (e.g., a fat person). We also discuss how different types of social biases may contribute to the results we observe.
+
+ comment: Preprint, Under Review +
+
+
+
+
+ + ♻ ☆ NovAScore: A New Automated Metric for Evaluating Document Level Novelty + + +
+ The rapid expansion of online content has intensified the issue of +information redundancy, underscoring the need for solutions that can identify +genuinely new information. Despite this challenge, the research community has +seen a decline in focus on novelty detection, particularly with the rise of +large language models (LLMs). Additionally, previous approaches have relied +heavily on human annotation, which is time-consuming, costly, and particularly +challenging when annotators must compare a target document against a vast +number of historical documents. In this work, we introduce NovAScore (Novelty +Evaluation in Atomicity Score), an automated metric for evaluating +document-level novelty. NovAScore aggregates the novelty and salience scores of +atomic information, providing high interpretability and a detailed analysis of +a document's novelty. With its dynamic weight adjustment scheme, NovAScore +offers enhanced flexibility and an additional dimension to assess both the +novelty level and the importance of information within a document. Our +experiments show that NovAScore strongly correlates with human judgments of +novelty, achieving a 0.626 Point-Biserial correlation on the TAP-DLND 1.0 +dataset and a 0.920 Pearson correlation on an internal human-annotated dataset. + +
+
+
+
+
+ + ♻ ☆ ReflectDiffu:Reflect between Emotion-intent Contagion and Mimicry for + Empathetic Response Generation via a RL-Diffusion Framework + + +
+ Empathetic response generation necessitates the integration of emotional and
+intentional dynamics to foster meaningful interactions. Existing research
+either neglects the intricate interplay between emotion and intent, leading to
+suboptimal controllability of empathy, or resorts to large language models
+(LLMs), which incur significant computational overhead. In this paper, we
+introduce ReflectDiffu, a lightweight and comprehensive framework for
+empathetic response generation. This framework incorporates emotion contagion
+to augment emotional expressiveness and employs an emotion-reasoning mask to
+pinpoint critical emotional elements. Additionally, it integrates intent
+mimicry within reinforcement learning for refinement during diffusion. By
+harnessing an intent twice-reflect mechanism of Exploring-Sampling-Correcting,
+ReflectDiffu adeptly translates emotional decision-making into precise intent
+actions, thereby addressing empathetic response misalignments stemming from
+emotional misrecognition. Through reflection, the framework maps emotional
+states to intents, markedly enhancing both response empathy and flexibility.
+Comprehensive experiments reveal that ReflectDiffu outperforms existing models
+regarding relevance, controllability, and informativeness, achieving
+state-of-the-art results in both automatic and human evaluations.
+
+
+
+
+
+ + ♻ ☆ Visualizing Temporal Topic Embeddings with a Compass + + +
+ Dynamic topic modeling is useful for discovering the development and change
+in latent topics over time. However, present methods rely on algorithms that
+separate document and word representations. This prevents the creation of a
+meaningful embedding space where changes in word usage and documents can be
+directly analyzed in a temporal context. This paper proposes an expansion of
+the compass-aligned temporal Word2Vec methodology into dynamic topic modeling.
+Such a method allows for the direct comparison of word and document embeddings
+across time in dynamic topics. This enables the creation of visualizations that
+incorporate temporal word embeddings within the context of documents into topic
+visualizations. In experiments against the current state-of-the-art, our
+proposed method demonstrates overall competitive performance in topic relevancy
+and diversity across temporal datasets of varying size. Simultaneously, it
+provides insightful visualizations focused on temporal word embeddings while
+maintaining the insights provided by global topic evolution, advancing our
+understanding of how topics evolve over time.
+
+
+ comment: 11 pages, 9 figures, conference paper +
+
+
+
+
+ + ♻ ☆ PlaSma: Making Small Language Models Better Procedural Knowledge Models + for (Counterfactual) Planning ICLR 2024 + + +
+ Procedural planning, which entails decomposing a high-level goal into a +sequence of temporally ordered steps, is an important yet intricate task for +machines. It involves integrating common-sense knowledge to reason about +complex and often contextualized situations, e.g. ``scheduling a doctor's +appointment without a phone''. While current approaches show encouraging +results using large language models (LLMs), they are hindered by drawbacks such +as costly API calls and reproducibility issues. In this paper, we advocate +planning using smaller language models. We present PlaSma, a novel two-pronged +approach to endow small language models with procedural knowledge and +(constrained) language planning capabilities. More concretely, we develop +symbolic procedural knowledge distillation to enhance the commonsense knowledge +in small language models and an inference-time algorithm to facilitate more +structured and accurate reasoning. In addition, we introduce a new related +task, Replanning, that requires a revision of a plan to cope with a constrained +situation. In both the planning and replanning settings, we show that +orders-of-magnitude smaller models (770M-11B parameters) can compete and often +surpass their larger teacher models' capabilities. Finally, we showcase +successful application of PlaSma in an embodied environment, VirtualHome. + +
+
+ comment: ICLR 2024 version , 31 pages +
+
+
+
+
+ + ♻ ☆ A New Era in Computational Pathology: A Survey on Foundation and + Vision-Language Models + + +
+ Recent advances in deep learning have completely transformed the domain of
+computational pathology (CPath). More specifically, they have altered the
+diagnostic workflow of pathologists by integrating foundation models (FMs) and
+vision-language models (VLMs) in their assessment and decision-making process.
+The limitations of existing deep learning approaches in CPath can be overcome
+by FMs through learning a representation space that can be adapted to a wide
+variety of downstream tasks without explicit supervision. Deploying VLMs allows
+pathology reports written in natural language to be used as rich semantic
+information sources to improve existing models as well as generate predictions
+in natural language form. In this survey, a holistic and systematic overview of
+recent innovations in FMs and VLMs in CPath is presented. Furthermore, the
+tools, datasets and training schemes for these models are summarized in
+addition to categorizing them into distinct groups. This extensive survey
+highlights the current trends in CPath and its possible revolution through the
+use of FMs and VLMs in the future.
+
+
+ comment: 20 pages, 19 figures and 9 tables +
+
+
+
+
+ + ♻ ☆ Measuring Dimensions of Self-Presentation in Twitter Bios and their + Links to Misinformation Sharing + + +
+ Social media platforms provide users with a profile description field,
+commonly known as a ``bio," where they can present themselves to the world. A
+growing literature shows that text in these bios can improve our understanding
+of online self-presentation and behavior, but existing work relies exclusively
+on keyword-based approaches to do so. We here propose and evaluate a suite of
+simple, effective, and theoretically motivated approaches to embed bios in
+spaces that capture salient dimensions of social meaning, such as age and
+partisanship. We evaluate our methods on four tasks, showing that the
+strongest one outperforms several practical baselines. We then show the
+utility of our method in helping understand associations between
+self-presentation and the sharing of URLs from low-quality news sites on
+Twitter, with a particular focus on the interactions between age and
+partisanship and on the effects of self-presentations of religiosity. Our work
+provides new tools to help computational social scientists make use of
+information in bios, and provides new insights into how misinformation sharing
+may be perceived on Twitter.
+
+
+
+
+
+ + ♻ ☆ A Hybrid Transformer and Attention Based Recurrent Neural Network for + Robust and Interpretable Sentiment Analysis of Tweets + + +
+ Sentiment analysis is crucial for understanding public opinion and consumer
+behavior. Existing models face challenges with linguistic diversity,
+generalizability, and explainability. To address these challenges, we propose
+TRABSA, a hybrid framework integrating transformer-based architectures,
+attention mechanisms, and BiLSTM networks. Leveraging RoBERTa trained on 124M
+tweets, we bridge gaps in sentiment analysis benchmarks, ensuring
+state-of-the-art accuracy. Augmenting datasets with tweets from 32 countries
+and US states, we compare six word-embedding techniques and three lexicon-based
+labeling techniques, selecting the best for optimal sentiment analysis. TRABSA
+outperforms traditional ML and deep learning models with 94% accuracy and
+significant precision, recall, and F1-score gains. Evaluation across diverse
+datasets demonstrates consistent superiority and generalizability. SHAP and
+LIME analyses enhance interpretability, improving confidence in predictions.
+Our study facilitates pandemic resource management, aiding resource planning,
+policy formation, and vaccination tactics.
+
+
+
+
+
+ + ♻ ☆ Creative Beam Search: LLM-as-a-Judge For Improving Response Generation + + +
+ Large language models are revolutionizing several areas, including artificial
+creativity. However, the process of generation in machines profoundly diverges
+from that observed in humans. In particular, machine generation is
+characterized by a lack of both intentionality and an underlying creative
+process. We propose a method called Creative Beam Search that uses Diverse Beam
+Search and LLM-as-a-Judge to perform response generation and response
+validation. The results of a qualitative experiment show how our approach can
+provide better output than standard sampling techniques. We also show that the
+response validation step is a necessary complement to the response generation
+step.
+
+
+ comment: Presented as a short paper at the 15th International Conference on + Computational Creativity (ICCC'24) +
+
+
+
+
+ + ♻ ☆ Language Models and Retrieval Augmented Generation for Automated + Structured Data Extraction from Diagnostic Reports + + +
+ Purpose: To develop and evaluate an automated system for extracting
+structured clinical information from unstructured radiology and pathology
+reports using open-weights large language models (LMs) and retrieval augmented
+generation (RAG), and to assess the effects of model configuration variables on
+extraction performance. Methods and Materials: The study utilized two datasets:
+7,294 radiology reports annotated for Brain Tumor Reporting and Data System
+(BT-RADS) scores and 2,154 pathology reports annotated for isocitrate
+dehydrogenase (IDH) mutation status. An automated pipeline was developed to
+benchmark the performance of various LMs and RAG configurations. The impact of
+model size, quantization, prompting strategies, output formatting, and
+inference parameters was systematically evaluated. Results: The best performing
+models achieved over 98% accuracy in extracting BT-RADS scores from radiology
+reports and over 90% for IDH mutation status extraction from pathology reports;
+the top-performing model was a medically fine-tuned llama3. Larger, newer, and
+domain fine-tuned models consistently outperformed older and smaller models.
+Model quantization had minimal impact on performance. Few-shot prompting
+significantly improved accuracy. RAG improved performance for complex pathology
+reports but not for shorter radiology reports. Conclusions: Open LMs
+demonstrate significant potential for automated extraction of structured
+clinical data from unstructured clinical reports with local privacy-preserving
+application. Careful model selection, prompt engineering, and semi-automated
+optimization using annotated data are critical for optimal performance. These
+approaches could be reliable enough for practical use in research workflows,
+highlighting the potential for human-machine collaboration in healthcare data
+extraction.
+
+
+
+
+
+ + ♻ ☆ On the Creativity of Large Language Models + + +
+ Large Language Models (LLMs) are revolutionizing several areas of Artificial +Intelligence. One of the most remarkable applications is creative writing, +e.g., poetry or storytelling: the generated outputs are often of astonishing +quality. However, a natural question arises: can LLMs be really considered +creative? In this article, we first analyze the development of LLMs under the +lens of creativity theories, investigating the key open questions and +challenges. In particular, we focus our discussion on the dimensions of value, +novelty, and surprise as proposed by Margaret Boden in her work. Then, we +consider different classic perspectives, namely product, process, press, and +person. We discuss a set of ``easy'' and ``hard'' problems in machine +creativity, presenting them in relation to LLMs. Finally, we examine the +societal impact of these technologies with a particular focus on the creative +industries, analyzing the opportunities offered, the challenges arising from +them, and the potential associated risks, from both legal and ethical points of +view. + +
+
+
+
+
+ + ♻ ☆ RetrievalAttention: Accelerating Long-Context LLM Inference via Vector + Retrieval + + +
+ Transformer-based Large Language Models (LLMs) have become increasingly
+important. However, due to the quadratic time complexity of attention
+computation, scaling LLMs to longer contexts incurs extremely slow inference
+latency and high GPU memory consumption for caching key-value (KV) vectors.
+This paper proposes RetrievalAttention, a training-free approach to both
+accelerate attention computation and reduce GPU memory consumption. By
+leveraging the dynamic sparsity of the attention mechanism, RetrievalAttention
+proposes to use approximate nearest neighbor search (ANNS) indexes for KV
+vectors in CPU memory and retrieves the most relevant ones with vector search
+during generation. Unfortunately, we observe that off-the-shelf ANNS indexes
+are often ineffective for such retrieval tasks due to the out-of-distribution
+(OOD) problem between query vectors and key vectors in the attention mechanism.
+RetrievalAttention addresses the OOD challenge by designing an attention-aware
+vector search algorithm that can adapt to the distribution of query vectors.
+Our evaluation shows that RetrievalAttention only needs to access 1--3% of data
+while maintaining high model accuracy. This leads to a significant reduction in
+the inference cost of long-context LLMs with a much lower GPU memory footprint.
+In particular, RetrievalAttention only needs a single NVIDIA RTX4090 (24GB) for
+serving 128K tokens in LLMs with 8B parameters, and is capable of generating
+one token in 0.188 seconds.
+
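+ For intuition, here is a minimal sketch of retrieval-based sparse attention:
+exact top-k scoring stands in for the paper's attention-aware ANNS index (an
+assumption for illustration), and only the retrieved KV pairs enter the
+softmax.
+
+```python
+import torch
+
+def retrieval_attention(q, K, V, k=64):
+    """Sparse attention over only the top-k most relevant KV pairs.
+
+    Stand-in for an ANNS index: exact top-k by dot product.
+    q: (d,) query; K, V: (n, d) cached key/value vectors.
+    """
+    scores = K @ q                                   # (n,) attention logits
+    top = torch.topk(scores, k=min(k, K.shape[0])).indices
+    w = torch.softmax(scores[top] / K.shape[1] ** 0.5, dim=0)
+    return w @ V[top]                                # mix retrieved values
+```
+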
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ A Robust Autoencoder Ensemble-Based Approach for Anomaly Detection in + Text + + +
+ Anomaly detection (AD) is a fast-growing and popular domain among established
+applications like vision and time series. We observe a rich literature for
+these applications, but anomaly detection in text is only starting to blossom.
+Recently, self-supervised methods with self-attention mechanisms have been the
+most popular choice. While recent works have proposed a working ground for
+building and benchmarking state-of-the-art approaches, we propose two principal
+contributions in this paper: contextual anomaly contamination and a novel
+ensemble-based approach. Our method, Textual Anomaly Contamination (TAC),
+makes it possible to contaminate inlier classes with either independent or
+contextual anomalies; this distinction does not appear to be made in the
+existing literature. For finding contextual anomalies, we propose RoSAE, a
+Robust Subspace Local Recovery Autoencoder Ensemble. Each autoencoder in the
+ensemble presents a different latent representation through local manifold
+learning. Benchmarks show that our approach outperforms recent works on both
+independent and contextual anomalies, while being more robust. We also provide
+a comparison across eight datasets instead of relying only on the Reuters and
+20 Newsgroups corpora.
+
+
+
+
+
+ + ♻ ☆ GEIC: Universal and Multilingual Named Entity Recognition with Large + Language Models + + +
+ Large Language Models (LLMs) have supplanted traditional methods in numerous +natural language processing tasks. Nonetheless, in Named Entity Recognition +(NER), existing LLM-based methods underperform compared to baselines and +require significantly more computational resources, limiting their application. +In this paper, we introduce the task of generation-based extraction and +in-context classification (GEIC), designed to leverage LLMs' prior knowledge +and self-attention mechanisms for NER tasks. We then propose CascadeNER, a +universal and multilingual GEIC framework for few-shot and zero-shot NER. +CascadeNER employs model cascading to utilize two small-parameter LLMs to +extract and classify independently, reducing resource consumption while +enhancing accuracy. We also introduce AnythingNER, the first NER dataset +specifically designed for LLMs, including 8 languages, 155 entity types and a +novel dynamic categorization system. Experiments show that CascadeNER achieves +state-of-the-art performance on low-resource and fine-grained scenarios, +including CrossNER and FewNERD. Our work is openly accessible. + +
+
+
+
+
+ + ♻ ☆ Reconciling Kaplan and Chinchilla Scaling Laws + + +
+ Kaplan et al. [2020] (`Kaplan') and Hoffmann et al. [2022] (`Chinchilla') +studied the scaling behavior of transformers trained on next-token language +prediction. These studies produced different estimates for how the number of +parameters ($N$) and training tokens ($D$) should be set to achieve the lowest +possible loss for a given compute budget ($C$). Kaplan: $N_\text{optimal} +\propto C^{0.73}$, Chinchilla: $N_\text{optimal} \propto C^{0.50}$. This paper +finds that much of this discrepancy can be attributed to Kaplan counting +non-embedding rather than total parameters, combined with their analysis being +performed at small scale. Simulating the Chinchilla study under these +conditions produces biased scaling coefficients close to Kaplan's. Hence, this +paper reaffirms Chinchilla's scaling coefficients, by explaining the primary +cause of Kaplan's original overestimation. As a second contribution, the paper +explains differences in the reported relationships between loss and compute. +These findings lead us to recommend that future scaling studies use total +parameters and compute. + +
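+ For a rough sense of how far the two exponents diverge at scale, the sketch
+below anchors both laws at the same reference point (the anchoring constants
+are illustrative assumptions; only the exponents 0.73 and 0.50 come from the
+papers):
+
+```python
+def n_optimal(C, exponent, C_ref=1e21, N_ref=(1e21 / 120) ** 0.5):
+    """N_optimal ∝ C^exponent, anchored so both laws agree at C_ref.
+
+    N_ref assumes a Chinchilla-style budget C = 6*N*D with D/N ≈ 20.
+    """
+    return N_ref * (C / C_ref) ** exponent
+
+for C in (1e21, 1e23, 1e25):
+    print(f"C={C:.0e}  Kaplan N*={n_optimal(C, 0.73):.2e}  "
+          f"Chinchilla N*={n_optimal(C, 0.50):.2e}")
+```
+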
+
+
+
+
+ + ♻ ☆ Neural Semantic Parsing with Extremely Rich Symbolic Meaning + Representations + + +
+ Current open-domain neural semantic parsers show impressive performance.
+However, closer inspection of the symbolic meaning representations they produce
+reveals significant weaknesses: sometimes they tend to merely copy character
+sequences from the source text to form symbolic concepts, defaulting to the
+most frequent word sense based on the training distribution. By leveraging the
+hierarchical structure of a lexical ontology, we introduce a novel
+compositional symbolic representation for concepts based on their position in
+the taxonomical hierarchy. This representation provides richer semantic
+information and enhances interpretability. We introduce a neural "taxonomical"
+semantic parser to utilize this new representation system of predicates, and
+compare it with a standard neural semantic parser trained on the traditional
+meaning representation format, employing a novel challenge set and evaluation
+metric. Our experimental findings demonstrate that the taxonomical model,
+trained on much richer and more complex meaning representations, performs
+slightly below the traditional model on the standard evaluation metrics, but
+outperforms it when dealing with out-of-vocabulary concepts. This finding is
+encouraging for research in computational semantics that aims to combine
+data-driven distributional meanings with knowledge-based symbolic
+representations.
+
+
+ comment: This manuscript has been accepted by Computational Linguistics + journal on 2024-09-07 +
+
+
+
+
+ + ♻ ☆ Internal Consistency and Self-Feedback in Large Language Models: A + Survey + + +
+ Large language models (LLMs) often exhibit deficient reasoning or generate +hallucinations. To address these, studies prefixed with "Self-" such as +Self-Consistency, Self-Improve, and Self-Refine have been initiated. They share +a commonality: involving LLMs evaluating and updating themselves. Nonetheless, +these efforts lack a unified perspective on summarization, as existing surveys +predominantly focus on categorization. + In this paper, we use a unified perspective of internal consistency, offering +explanations for reasoning deficiencies and hallucinations. Internal +consistency refers to the consistency in expressions among LLMs' latent, +decoding, or response layers based on sampling methodologies. Then, we +introduce an effective theoretical framework capable of mining internal +consistency, named Self-Feedback. This framework consists of two modules: +Self-Evaluation and Self-Update. The former captures internal consistency +signals, while the latter leverages the signals to enhance either the model's +response or the model itself. This framework has been employed in numerous +studies. + We systematically classify these studies by tasks and lines of work; +summarize relevant evaluation methods and benchmarks; and delve into the +concern, "Does Self-Feedback Really Work?" We also propose several critical +viewpoints, including the "Hourglass Evolution of Internal Consistency", +"Consistency Is (Almost) Correctness" hypothesis, and "The Paradox of Latent +and Explicit Reasoning". The relevant resources are open-sourced at +https://github.com/IAAR-Shanghai/ICSFSurvey. + +
+
+ comment: 20 pages, 10 figures, 6 tables, 13 equations +
+
+
+
+
+ + ♻ ☆ AugTriever: Unsupervised Dense Retrieval by Scalable Data Augmentation + + +
+ Dense retrievers have made significant strides in text retrieval and
+open-domain question answering. However, most of these achievements have relied
+heavily on extensive human-annotated supervision. In this study, we aim to
+develop unsupervised methods for improving dense retrieval models. We propose
+two approaches that enable annotation-free and scalable training by creating
+pseudo query-document pairs: query extraction and transferred query generation.
+The query extraction method involves selecting salient spans from the original
+document to generate pseudo queries. On the other hand, the transferred query
+generation method utilizes generation models trained for other NLP tasks, such
+as summarization, to produce pseudo queries. Through extensive experimentation,
+we demonstrate that models trained using these augmentation methods can achieve
+comparable, if not better, performance than multiple strong dense baselines.
+Moreover, combining these strategies leads to further improvements, resulting
+in superior performance of unsupervised dense retrieval, unsupervised domain
+adaptation and supervised finetuning, benchmarked on both BEIR and ODQA
+datasets. Code and datasets are publicly available at
+https://github.com/salesforce/AugTriever.
+
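+ A minimal sketch of the query-extraction idea: choose the most salient span
+of a document as its pseudo query. The saliency score below (a sliding-window
+sum of IDF weights) is an illustrative stand-in for the paper's selection
+strategies:
+
+```python
+import math
+import re
+from collections import Counter
+
+def extract_pseudo_query(doc, corpus, span_len=8):
+    """Return the span of `doc` with the highest total IDF weight."""
+    df = Counter()
+    for d in corpus:                       # document frequencies
+        df.update(set(re.findall(r"\w+", d.lower())))
+    n = len(corpus)
+    idf = lambda t: math.log((n + 1) / (df[t] + 1))
+    toks = re.findall(r"\w+", doc.lower())
+    starts = range(max(1, len(toks) - span_len + 1))
+    best = max(starts, key=lambda i: sum(idf(t) for t in toks[i:i + span_len]))
+    return " ".join(toks[best:best + span_len])
+```
+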
+
+ comment: DCAI24, October 25, 2024, Boise, ID +
+
+
+
+
+ + ♻ ☆ Show Less, Instruct More: Enriching Prompts with Definitions and + Guidelines for Zero-Shot NER + + +
+ Recently, several specialized instruction-tuned Large Language Models (LLMs)
+for Named Entity Recognition (NER) have emerged. Compared to traditional NER
+approaches, these models have demonstrated strong generalization capabilities.
+Existing LLMs primarily focus on addressing zero-shot NER on Out-of-Domain
+inputs, while fine-tuning on an extensive number of entity classes that often
+highly or completely overlap with test sets. In this work, instead, we propose
+SLIMER, an approach designed to tackle never-seen-before entity tags by
+instructing the model on fewer examples, and by leveraging a prompt enriched
+with definition and guidelines. Experiments demonstrate that definition and
+guidelines yield better performance, faster and more robust learning,
+particularly when labelling unseen named entities. Furthermore, SLIMER performs
+comparably to state-of-the-art approaches in out-of-domain zero-shot NER, while
+being trained in a fairer, though certainly more challenging, setting.
+
+
+
+
+
+ + ♻ ☆ Leveraging Large Language Models for Solving Rare MIP Challenges + + +
+ Mixed Integer Programming (MIP) has been extensively applied in areas +requiring mathematical solvers to address complex instances within tight time +constraints. However, as the problem scale increases, the complexity of model +formulation and finding feasible solutions escalates significantly. In +contrast, the model-building cost for end-to-end models, such as large language +models (LLMs), remains largely unaffected by problem scale due to their pattern +recognition capabilities. While LLMs, like GPT-4, without fine-tuning, can +handle some traditional medium-scale MIP problems, they struggle with uncommon +or highly specialized MIP scenarios. Fine-tuning LLMs can yield some feasible +solutions for medium-scale MIP instances, but these models typically fail to +explore diverse solutions when constrained by a low and constant temperature, +limiting their performance. In this paper, we propose and evaluate a +recursively dynamic temperature method integrated with a chain-of-thought +approach. Our findings show that starting with a high temperature and gradually +lowering it leads to better feasible solutions compared to other dynamic +temperature strategies. Additionally, by comparing results generated by the LLM +with those from Gurobi, we demonstrate that the LLM can produce solutions that +complement traditional solvers by accelerating the pruning process and +improving overall efficiency. + +
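+ The temperature schedule can be sketched in a few lines. Here `llm_sample`,
+`is_feasible`, and `cost` are hypothetical callables standing in for the LLM
+API, the MIP feasibility check, and the objective; the linear decay is one
+simple instance of a dynamic schedule:
+
+```python
+def solve_with_cooling(llm_sample, is_feasible, cost, prompt,
+                       n_rounds=8, t_start=1.2, t_end=0.2):
+    """Sample candidate solutions while cooling the temperature.
+
+    Starting hot encourages diverse candidates; later, cooler rounds
+    focus the search around promising solutions.
+    """
+    best = None
+    for i in range(n_rounds):
+        t = t_start + (t_end - t_start) * i / max(1, n_rounds - 1)
+        candidate = llm_sample(prompt, temperature=t)
+        if is_feasible(candidate) and (best is None or cost(candidate) < cost(best)):
+            best = candidate
+    return best
+```
+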
+
+
+
+
+ + ♻ ☆ Exploring Fine-tuned Generative Models for Keyphrase Selection: A Case + Study for Russian + + +
+ Keyphrase selection plays a pivotal role within the domain of scholarly
+texts, facilitating efficient information retrieval, summarization, and
+indexing. In this work, we explored how to apply fine-tuned generative
+transformer-based models to the specific task of keyphrase selection within
+Russian scientific texts. We experimented with four distinct generative models,
+namely ruT5, ruGPT, mT5, and mBART, and evaluated their performance in both
+in-domain and cross-domain settings. The experiments were conducted on the
+texts of Russian scientific abstracts from four domains: mathematics & computer
+science, history, medicine, and linguistics. The use of generative models,
+namely mBART, led to gains in in-domain performance (up to 4.9% in BERTScore,
+9.0% in ROUGE-1, and 12.2% in F1-score) over three keyphrase extraction
+baselines for the Russian language. Although the results for cross-domain usage
+were significantly lower, they still demonstrated the capability to surpass
+baseline performances in several cases, underscoring the promising potential
+for further exploration and refinement in this research field.
+
+
+ comment: DAMDID-2024 +
+
+
+
+
+ + ♻ ☆ Propulsion: Steering LLM with Tiny Fine-Tuning + + +
+ The rapid advancements in Large Language Models (LLMs) have revolutionized
+natural language processing (NLP) and related fields. However, fine-tuning
+these models for specific tasks remains computationally expensive and risks
+degrading pre-learned features. To address these challenges, we propose
+Propulsion, a novel parameter efficient fine-tuning (PEFT) method designed to
+optimize task-specific performance while drastically reducing computational
+overhead. Inspired by the concept of controlled adjustments in physical motion,
+Propulsion selectively re-scales specific dimensions of a pre-trained model,
+guiding output predictions toward task objectives without modifying the model's
+pretrained parameters. By introducing lightweight, trainable Propulsion
+parameters at the pre-trained layer, we minimize the number of parameters
+updated during fine-tuning, preventing overfitting or overwriting of existing
+knowledge. Our theoretical analysis, supported by Neural Tangent Kernel (NTK)
+theory, shows that Propulsion approximates the performance of full fine-tuning
+with far fewer trainable parameters. Empirically, Propulsion reduces the
+trainable parameter count from 355.3 million to just 0.086 million, achieving
+over a 10x reduction compared to standard approaches like LoRA while
+maintaining competitive performance across benchmarks.
+
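+ A minimal sketch of the steering idea: wrap a frozen pretrained layer and
+train only a per-dimension scale on its output (where the scale sits and how
+it is initialized are assumptions for illustration):
+
+```python
+import torch
+import torch.nn as nn
+
+class PropulsionLinear(nn.Module):
+    """Frozen linear layer with a trainable per-dimension output re-scaling."""
+
+    def __init__(self, linear: nn.Linear):
+        super().__init__()
+        self.linear = linear
+        for p in self.linear.parameters():
+            p.requires_grad = False            # pretrained weights stay frozen
+        # One trainable scalar per output dimension -- the only new parameters.
+        self.scale = nn.Parameter(torch.ones(linear.out_features))
+
+    def forward(self, x):
+        return self.linear(x) * self.scale     # steer outputs, not weights
+```
+
+ Training only `out_features` scalars per wrapped layer is what drives the
+drastic reduction in trainable parameters.
+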
+
+ comment: 26 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Towards No-Code Programming of Cobots: Experiments with Code Synthesis + by Large Code Models for Conversational Programming + + +
+ While there has been a lot of research recently on robots in household
+environments, at the present time, most robots in existence can be found on
+shop floors, and most interactions between humans and robots happen there.
+``Collaborative robots'' (cobots) designed to work alongside humans on assembly
+lines traditionally require expert programming, limiting the ability to make
+changes, or manual guidance, limiting the expressivity of the resulting
+programs. To address these limitations, we explore using Large Language Models
+(LLMs), and in particular their in-context learning abilities, for
+conversational code generation. As a first step, we define RATS, the
+``Repetitive Assembly Task'', a 2D building task designed to lay the foundation
+for simulating industry assembly scenarios. In this task, a `programmer'
+instructs a cobot, using natural language, on how a certain assembly is to be
+built; that is, the programmer induces a program, through natural language. We
+create a dataset that pairs target structures with various example instructions
+(human-authored, template-based, and model-generated) and example code. With
+this, we systematically evaluate the capabilities of state-of-the-art LLMs for
+synthesising this kind of code, given in-context examples. Evaluating in a
+simulated environment, we find that LLMs are capable of generating accurate
+`first order code' (instruction sequences), but have problems producing
+`higher-order code' (abstractions such as functions, or use of loops).
+
+
+
+
+
+ + ♻ ☆ Unlock the Power of Frozen LLMs in Knowledge Graph Completion + + +
+ Traditional knowledge graph completion (KGC) methods rely solely on
+structural information, struggling with the inherent sparsity of knowledge
+graphs (KGs). Large Language Models (LLMs) learn extensive knowledge from large
+corpora with powerful context modeling, making them promising for mitigating
+the limitations of previous methods. Directly fine-tuning LLMs offers great
+capability but comes at the cost of huge time and memory consumption, while
+utilizing frozen LLMs yields suboptimal results. In this work, we aim to
+leverage LLMs for KGC effectively and efficiently. We capture the context-aware
+hidden states of knowledge triples by employing prompts to stimulate the
+intermediate layers of LLMs. We then train a data-efficient classifier on these
+hidden states to harness the inherent capabilities of frozen LLMs in KGC.
+Additionally, to reduce ambiguity and enrich knowledge representation, we
+generate detailed entity descriptions through subgraph sampling on KGs.
+Extensive experiments on standard benchmarks demonstrate the efficiency and
+effectiveness of our approach. We outperform traditional KGC methods across
+most datasets and, notably, achieve classification performance comparable to
+fine-tuned LLMs while enhancing GPU memory efficiency by $188\times$ and
+accelerating training and inference by $13.48\times$.
+
+
+
+
+
+ + ♻ ☆ Guiding In-Context Learning of LLMs through Quality Estimation for + Machine Translation + + +
+ The quality of output from large language models (LLMs), particularly in +machine translation (MT), is closely tied to the quality of in-context examples +(ICEs) provided along with the query, i.e., the text to translate. The +effectiveness of these ICEs is influenced by various factors, such as the +domain of the source text, the order in which the ICEs are presented, the +number of these examples, and the prompt templates used. Naturally, selecting +the most impactful ICEs depends on understanding how these affect the resulting +translation quality, which ultimately relies on translation references or human +judgment. This paper presents a novel methodology for in-context learning (ICL) +that relies on a search algorithm guided by domain-specific quality estimation +(QE). Leveraging the XGLM model, our methodology estimates the resulting +translation quality without the need for translation references, selecting +effective ICEs for MT to maximize translation quality. Our results demonstrate +significant improvements over existing ICL methods and higher translation +performance compared to fine-tuning a pre-trained language model (PLM), +specifically mBART-50. + +
+
+ comment: Camera-ready version of the paper for the Association for Machine + Translation in the Americas (AMTA), including the link to the paper's + repository +
+
+
+
+
+ + ♻ ☆ Zero-resource Hallucination Detection for Text Generation via + Graph-based Contextual Knowledge Triples Modeling + + +
+ LLMs obtain remarkable performance but suffer from hallucinations. Most
+research on detecting hallucination focuses on questions with short, concrete
+correct answers whose faithfulness is easy to check. Hallucination detection
+for text generation with open-ended answers is more challenging. Some
+researchers use external knowledge to detect hallucinations in generated
+texts, but external resources for specific scenarios are hard to access. Recent
+studies on detecting hallucinations in long text without external resources
+conduct consistency comparisons among multiple sampled outputs. To handle long
+texts, researchers split them into multiple facts and individually compare the
+consistency of each pair of facts. However, these methods (1) hardly achieve
+alignment among multiple facts; (2) overlook dependencies between multiple
+contextual facts. In this paper, we propose a graph-based context-aware (GCA)
+hallucination detection method for text generation, which aligns knowledge
+facts and considers the dependencies between contextual knowledge triples in
+consistency comparison. In particular, to align multiple facts, we conduct a
+triple-oriented response segmentation to extract multiple knowledge triples. To
+model dependencies among contextual knowledge triples (facts), we construct the
+contextual triples into a graph and enhance the triples' interactions via
+message passing and aggregation with an RGCN. To avoid the omission of
+knowledge triples in long text, we conduct an LLM-based reverse verification by
+reconstructing the knowledge triples. Experiments show that our model enhances
+hallucination detection and outperforms all baselines.
+
+
+
+
+
+ + ♻ ☆ Prior Constraints-based Reward Model Training for Aligning Large + Language Models CCL 2024 + + +
+ Reinforcement learning with human feedback for aligning large language models
+(LLMs) trains a reward model typically using ranking loss with comparison
+pairs. However, the training procedure suffers from an inherent problem: the
+uncontrolled scaling of reward scores during reinforcement learning due to the
+lack of constraints while training the reward model. This paper proposes a
+Prior Constraints-based Reward Model (namely PCRM) training method to mitigate
+this problem. PCRM incorporates prior constraints, specifically, length ratio
+and cosine similarity between outputs of each comparison pair, during reward
+model training to regulate optimization magnitude and control score margins. We
+comprehensively evaluate PCRM by examining its rank correlation with human
+preferences and its effectiveness in aligning LLMs via RL. Experimental results
+demonstrate that PCRM significantly improves alignment performance by
+effectively constraining reward score scaling. As another bonus, our method is
+easily integrated into arbitrary rank-based alignment methods, such as direct
+preference optimization, and can yield consistent improvement.
+
+
+ comment: Accepted by CCL 2024 +
+
+
+
+
+ + ♻ ☆ ASVD: Activation-aware Singular Value Decomposition for Compressing + Large Language Models + + +
+ In this paper, we introduce a new post-training compression paradigm for +Large Language Models (LLMs) to facilitate their wider adoption. We delve into +LLM weight low-rank factorization, and find that the challenges of this task +stem from the outlier phenomenon in the LLM activations and the sensitivity +difference among various kinds of layers. To address these issues, we propose a +training-free approach called Activation-aware Singular Value Decomposition +(ASVD). Specifically, ASVD manages activation outliers by scaling the weight +matrix based on the activation distribution, thereby enhancing decomposition +accuracy. Additionally, we propose an efficient iterative calibration process +to optimize layer-specific decomposition by addressing the varying sensitivity +of different LLM layers. ASVD can compress a network by 10-20%, without +compromising the performance of LLMs. Based on the success of the low-rank +decomposition of projection matrices in the self-attention module, we further +introduce ASVD to compress the KV cache. By reducing the channel dimension of +KV activations, memory requirements for KV cache can be largely reduced. Thanks +to the 50-75% reduction in the rank of the KV projection matrices, ASVD can +further achieve 50% KV cache reductions without performance drop in a +training-free manner. + +
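+ The core decomposition can be sketched as follows (per-channel mean absolute
+activation as the scaling statistic is an assumption; the paper's iterative
+calibration is omitted):
+
+```python
+import torch
+
+def asvd_compress(W, acts, rank):
+    """Activation-aware low-rank factorization: W ≈ A @ B.
+
+    W: (out, in) weight matrix; acts: (n, in) sampled layer inputs.
+    Scaling the input channels before the SVD steers the approximation
+    error away from channels with large (outlier-prone) activations.
+    """
+    s = acts.abs().mean(dim=0).clamp(min=1e-6)   # (in,) channel scales
+    U, S, Vh = torch.linalg.svd(W * s, full_matrices=False)
+    A = U[:, :rank] * S[:rank]                   # (out, rank)
+    B = Vh[:rank] / s                            # (rank, in), fold scaling back
+    return A, B
+```
+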
+
+
+
+
+ + ♻ ☆ MiLoRA: Harnessing Minor Singular Components for Parameter-Efficient LLM + Finetuning + + +
+ Efficient finetuning of large language models (LLMs) aims to adapt the LLMs
+with reduced computational and memory cost. Previous LoRA-based approaches
+initialize the low-rank matrices with Gaussian distribution and zero values
+while keeping the original weight matrices frozen. However, the trainable model
+parameters optimized in an unguided subspace might interfere with the
+well-learned subspace of the pretrained weight matrices. In this paper, we
+propose MiLoRA, a simple yet effective LLM finetuning approach that only
+updates the minor singular components of the weight matrix while keeping the
+principal singular components frozen. It is observed that the minor matrix
+corresponds to noisy or long-tail information, while the principal matrix
+contains important knowledge. MiLoRA initializes the low-rank matrices within
+a subspace that is orthogonal to the principal matrix, so the pretrained
+knowledge is expected to be well preserved. During finetuning, MiLoRA makes
+the most use of the less-optimized subspace for learning the labeled dataset.
+Extensive experiments on commonsense reasoning, math reasoning, instruction
+following and visual instruction following benchmarks present the superior
+performance of our method.
+
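+ The initialization can be sketched directly from an SVD; the sqrt-scaled
+split of the minor factors below is one common convention, assumed here for
+illustration:
+
+```python
+import torch
+
+def milora_init(W, r):
+    """Split W into a frozen principal part and a trainable minor adapter.
+
+    The adapter (A, B) is built from the r smallest singular components,
+    so training starts in a subspace orthogonal to the principal one.
+    """
+    U, S, Vh = torch.linalg.svd(W, full_matrices=False)
+    W_principal = U[:, :-r] @ torch.diag(S[:-r]) @ Vh[:-r]   # frozen
+    A = U[:, -r:] * S[-r:].sqrt()                # (out, r), trainable
+    B = Vh[-r:] * S[-r:].sqrt().unsqueeze(1)     # (r, in), trainable
+    return W_principal, A, B       # at init: W_principal + A @ B == W
+```
+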
+
+
+
+
+ + ♻ ☆ MURRE: Multi-Hop Table Retrieval with Removal for Open-Domain + Text-to-SQL + + +
+ The open-domain text-to-SQL task aims to retrieve question-relevant tables
+from massive databases and generate SQL. However, the performance of current
+methods is constrained by single-hop retrieval, and the existing multi-hop
+retrieval of open-domain question answering is not directly applicable due to
+the tendency to retrieve tables similar to the retrieved ones but irrelevant to
+the question: questions in text-to-SQL usually already contain all required
+information, whereas previous multi-hop retrieval supplements the question with
+retrieved documents. Therefore, we propose multi-hop table retrieval with
+removal (MURRE), which removes previously retrieved information from the
+question to guide the retriever towards unretrieved relevant tables. Our
+experiments on two open-domain text-to-SQL datasets demonstrate an average
+improvement of 5.7% over the previous state-of-the-art results.
+
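+ A heavily simplified sketch of the retrieve-then-remove loop. Here
+`retriever` is a hypothetical interface returning the best-matching table
+(with a "schema" string), and removal is approximated by masking question
+tokens already covered by retrieved schemas:
+
+```python
+import re
+
+def multi_hop_retrieve(question, tables, retriever, hops=3):
+    """Retrieve tables hop by hop, shrinking the query after each hop."""
+    retrieved, query = [], question
+    for _ in range(hops):
+        candidates = [t for t in tables if t not in retrieved]
+        if not candidates or not query:
+            break
+        table = retriever(query, candidates)
+        retrieved.append(table)
+        covered = set(re.findall(r"\w+", table["schema"].lower()))
+        query = " ".join(t for t in re.findall(r"\w+", query.lower())
+                         if t not in covered)
+    return retrieved
+```
+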
+
+
+
+
+ + ♻ ☆ Rethinking Kullback-Leibler Divergence in Knowledge Distillation for + Large Language Models + + +
+ Kullback-Leibler divergence has been widely used in Knowledge Distillation
+(KD) to compress Large Language Models (LLMs). Contrary to prior assertions
+that reverse Kullback-Leibler (RKL) divergence is mode-seeking and thus
+preferable over the mean-seeking forward Kullback-Leibler (FKL) divergence,
+this study empirically and theoretically demonstrates that neither mode-seeking
+nor mean-seeking properties manifest in KD for LLMs. Instead, RKL and FKL are
+found to share the same optimization objective and both converge after a
+sufficient number of epochs. However, due to practical constraints, LLMs are
+seldom trained for such an extensive number of epochs. Meanwhile, we further
+find that in the early epochs RKL focuses on the tail part of the
+distributions, while FKL focuses on the head part. Consequently, we propose a
+simple yet effective Adaptive Kullback-Leibler (AKL) divergence method, which
+adaptively allocates weights to combine FKL and RKL. Metric-based and
+GPT-4-based evaluations demonstrate that the proposed AKL outperforms the
+baselines across various tasks and improves the diversity and quality of
+generated responses.
+
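+ A sketch of an adaptively weighted FKL/RKL combination. The head/tail
+gap-based weighting below follows the spirit of the abstract, but its exact
+form is an assumption:
+
+```python
+import torch
+
+def akl_loss(student_logits, teacher_logits, head_mass=0.5):
+    """Weight FKL vs. RKL by where the student mismatches the teacher."""
+    log_p = teacher_logits.log_softmax(-1); p = log_p.exp()   # teacher
+    log_q = student_logits.log_softmax(-1); q = log_q.exp()   # student
+    fkl = (p * (log_p - log_q)).sum(-1)          # forward KL(p || q)
+    rkl = (q * (log_q - log_p)).sum(-1)          # reverse KL(q || p)
+    # Split the vocabulary into head/tail by teacher probability mass.
+    sorted_p, idx = p.sort(-1, descending=True)
+    head = sorted_p.cumsum(-1) <= head_mass
+    gap = (sorted_p - q.gather(-1, idx)).abs()
+    head_err = (gap * head).sum(-1)
+    tail_err = (gap * ~head).sum(-1)
+    w = head_err / (head_err + tail_err + 1e-9)  # more head error -> more FKL
+    return (w * fkl + (1.0 - w) * rkl).mean()
+```
+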
+
+ comment: working in progress, code available at + https://github.com/wutaiqiang/LLM_KD_AKL +
+
+
+
+
+ + ♻ ☆ Towards Building a Robust Knowledge Intensive Question Answering Model + with Large Language Models NLPCC-2024 + + +
+ The development of LLMs has greatly enhanced the intelligence and fluency of
+question answering, while the emergence of retrieval enhancement has enabled
+models to better utilize external information. However, the presence of noise
+and errors in retrieved information poses challenges to the robustness of LLMs.
+In this work, to evaluate the model's performance under multiple interferences,
+we first construct a dataset based on machine reading comprehension datasets,
+simulating various scenarios including critical information absence, noise,
+and conflicts. To address the issue of model accuracy decline caused by noisy
+external information, we propose a data augmentation-based fine-tuning method
+to enhance the LLM's robustness against noise. Additionally, a contrastive
+learning approach is utilized to preserve the model's discrimination capability
+for external information. We have conducted experiments on both existing LLMs
+and our approach; the results, evaluated by GPT-4, indicate that our proposed
+methods improve model robustness while strengthening the model's discrimination
+capability.
+
+
+ comment: This paper has been accepted by NLPCC-2024 +
+
+
+
+
+ + ♻ ☆ LLaVA-Docent: Instruction Tuning with Multimodal Large Language Model to + Support Art Appreciation Education + + +
+ Despite the development of various AI systems to support learning in various
+domains, AI assistance for art appreciation education has not been extensively
+explored. Art appreciation, often perceived as an unfamiliar and challenging
+endeavor for most students, can be made more accessible with a generative AI
+enabled conversation partner that provides tailored questions and encourages
+the audience to deeply appreciate artwork. This study explores the application
+of multimodal large language models (MLLMs) in art appreciation education, with
+a focus on developing LLaVA-Docent, a model designed to serve as a personal
+tutor for art appreciation. Our approach involved design and development
+research, focusing on iterative enhancement to design and develop the
+application to produce a functional MLLM-enabled chatbot along with a data
+design framework for art appreciation education. To that end, we established a
+virtual dialogue dataset that was generated by GPT-4, which was instrumental in
+training our MLLM, LLaVA-Docent. The performance of LLaVA-Docent was evaluated
+by benchmarking it against alternative settings and revealed its distinct
+strengths and weaknesses. Our findings highlight the efficacy of the MLLM-based
+personalized art appreciation chatbot and demonstrate its applicability for a
+novel approach in which art appreciation is taught and experienced.
+
+
+ comment: 37 pages, 4 figures, 10 tables +
+
+
+
+
+ + ♻ ☆ A Systematic Review of Aspect-based Sentiment Analysis: Domains, + Methods, and Trends + + +
+ Aspect-based sentiment analysis (ABSA) is a fine-grained type of sentiment
+analysis that identifies aspects and their associated opinions from a given
+text. With the surge of digital opinionated text data, ABSA gained increasing
+popularity for its ability to mine more detailed and targeted insights. Many
+review papers on ABSA subtasks and solution methodologies exist; however, few
+focus on trends over time or systemic issues relating to research application
+domains, datasets, and solution approaches. To fill the gap, this paper
+presents a systematic literature review (SLR) of ABSA studies with a focus on
+trends and high-level relationships among these fundamental components. This
+review is one of the largest SLRs on ABSA. To our knowledge, it is also the
+first to systematically examine the interrelations among ABSA research and data
+distribution across domains, as well as trends in solution paradigms and
+approaches. Our sample includes 727 primary studies screened from 8550 search
+results without time constraints via an innovative automatic filtering process.
+Our quantitative analysis not only identifies trends in nearly two decades of
+ABSA research development but also unveils a systemic lack of dataset and
+domain diversity as well as domain mismatch that may hinder the development of
+future ABSA research. We discuss these findings and their implications and
+propose suggestions for future research.
+
+
+
+
+
+ + ♻ ☆ Natural Language Processing for Dialects of a Language: A Survey + + +
+ State-of-the-art natural language processing (NLP) models are trained on
+massive training corpora, and report a superlative performance on evaluation
+datasets. This survey delves into an important attribute of these datasets: the
+dialect of a language. Motivated by the performance degradation of NLP models
+for dialectic datasets and its implications for the equity of language
+technologies, we survey past research in NLP for dialects in terms of datasets,
+and approaches. We describe a wide range of NLP tasks in terms of two
+categories: natural language understanding (NLU) (for tasks such as dialect
+classification, sentiment analysis, parsing, and NLU benchmarks) and natural
+language generation (NLG) (for summarisation, machine translation, and dialogue
+systems). The survey is also broad in its coverage of languages which include
+English, Arabic, German among others. We observe that past work in NLP
+concerning dialects goes deeper than mere dialect classification: it ranges
+from early approaches that used sentence transduction to recent approaches
+that integrate hypernetworks into LoRA. We expect that this survey will be
+useful to NLP researchers interested in building equitable language
+technologies by rethinking LLM benchmarks and model architectures.
+
+
+ comment: The paper is under review at ACM Computing Surveys. Please reach out + to the authors in the case of feedback +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 113 + +
+
+
+ + ☆ Vista3D: Unravel the 3D Darkside of a Single Image ECCV'2024 + + +
+ We embark on the age-old quest: unveiling the hidden dimensions of objects +from mere glimpses of their visible parts. To address this, we present Vista3D, +a framework that realizes swift and consistent 3D generation within a mere 5 +minutes. At the heart of Vista3D lies a two-phase approach: the coarse phase +and the fine phase. In the coarse phase, we rapidly generate initial geometry +with Gaussian Splatting from a single image. In the fine phase, we extract a +Signed Distance Function (SDF) directly from learned Gaussian Splatting, +optimizing it with a differentiable isosurface representation. Furthermore, it +elevates the quality of generation by using a disentangled representation with +two independent implicit functions to capture both visible and obscured aspects +of objects. Additionally, it harmonizes gradients from 2D diffusion prior with +3D-aware diffusion priors by angular diffusion prior composition. Through +extensive evaluation, we demonstrate that Vista3D effectively sustains a +balance between the consistency and diversity of the generated 3D objects. +Demos and code will be available at https://github.com/florinshen/Vista3D. + +
+
+ comment: ECCV'2024 +
+
+
+
+
+ + ☆ DynaMo: In-Domain Dynamics Pretraining for Visuo-Motor Control + + +
+ Imitation learning has proven to be a powerful tool for training complex +visuomotor policies. However, current methods often require hundreds to +thousands of expert demonstrations to handle high-dimensional visual +observations. A key reason for this poor data efficiency is that visual +representations are predominantly either pretrained on out-of-domain data or +trained directly through a behavior cloning objective. In this work, we present +DynaMo, a new in-domain, self-supervised method for learning visual +representations. Given a set of expert demonstrations, we jointly learn a +latent inverse dynamics model and a forward dynamics model over a sequence of +image embeddings, predicting the next frame in latent space, without +augmentations, contrastive sampling, or access to ground truth actions. +Importantly, DynaMo does not require any out-of-domain data such as Internet +datasets or cross-embodied datasets. On a suite of six simulated and real +environments, we show that representations learned with DynaMo significantly +improve downstream imitation learning performance over prior self-supervised +learning objectives, and pretrained representations. Gains from using DynaMo +hold across policy classes such as Behavior Transformer, Diffusion Policy, MLP, +and nearest neighbors. Finally, we ablate over key components of DynaMo and +measure its impact on downstream policy performance. Robot videos are best +viewed at https://dynamo-ssl.github.io + +
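+ The joint objective can be sketched as follows; dimensions, the linear
+dynamics models, and the stop-gradient placement are illustrative assumptions
+(the real method operates on embeddings from a jointly trained visual encoder):
+
+```python
+import torch
+import torch.nn as nn
+
+class LatentDynamicsSSL(nn.Module):
+    """Joint inverse- and forward-dynamics pretraining over embeddings."""
+
+    def __init__(self, obs_dim=512, latent_act_dim=16):
+        super().__init__()
+        self.inverse = nn.Linear(2 * obs_dim, latent_act_dim)
+        self.dynamics = nn.Linear(obs_dim + latent_act_dim, obs_dim)
+
+    def loss(self, z):                       # z: (batch, T, obs_dim)
+        z_t, z_next = z[:, :-1], z[:, 1:]
+        a = self.inverse(torch.cat([z_t, z_next], dim=-1))   # latent "action"
+        z_pred = self.dynamics(torch.cat([z_t, a], dim=-1))
+        # Predict the next embedding; stop-gradient on the target.
+        return ((z_pred - z_next.detach()) ** 2).mean()
+
+model = LatentDynamicsSSL()
+model.loss(torch.randn(8, 10, 512)).backward()   # smoke test
+```
+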
+
+
+
+
+ + ☆ Qwen2-VL: Enhancing Vision-Language Model's Perception of the World at + Any Resolution + + +
+ We present the Qwen2-VL Series, an advanced upgrade of the previous Qwen-VL
+models that redefines the conventional predetermined-resolution approach in
+visual processing. Qwen2-VL introduces the Naive Dynamic Resolution mechanism,
+which enables the model to dynamically process images of varying resolutions
+into different numbers of visual tokens. This approach allows the model to
+generate more efficient and accurate visual representations, closely aligning
+with human perceptual processes. The model also integrates Multimodal Rotary
+Position Embedding (M-RoPE), facilitating the effective fusion of positional
+information across text, images, and videos. We employ a unified paradigm for
+processing both images and videos, enhancing the model's visual perception
+capabilities. To explore the potential of large multimodal models, Qwen2-VL
+investigates the scaling laws for large vision-language models (LVLMs). By
+scaling both the model size (with versions at 2B, 8B, and 72B parameters) and
+the amount of training data, the Qwen2-VL Series achieves highly competitive
+performance. Notably, the Qwen2-VL-72B model achieves results comparable to
+leading models such as GPT-4o and Claude3.5-Sonnet across various multimodal
+benchmarks, outperforming other generalist models. Code is available at
+\url{https://github.com/QwenLM/Qwen2-VL}.
+
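+ The practical consequence of dynamic resolution is that the number of visual
+tokens tracks the native image size rather than a fixed resize; a
+back-of-the-envelope sketch (the patch size and 2x2 token merging below are
+assumptions for illustration):
+
+```python
+def visual_token_count(height, width, patch=14, merge=2):
+    """Tokens grow with resolution instead of being fixed by a resize."""
+    return (height // patch) * (width // patch) // (merge * merge)
+
+print(visual_token_count(448, 448))   # larger image -> more tokens
+print(visual_token_count(224, 896))   # aspect ratio is preserved
+```
+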
+
+ comment: Code is available at https://github.com/QwenLM/Qwen2-VL +
+
+
+
+
+ + ☆ Bundle Adjustment in the Eager Mode + + +
+ Bundle adjustment (BA) is a critical technique in various robotic +applications, such as simultaneous localization and mapping (SLAM), augmented +reality (AR), and photogrammetry. BA optimizes parameters such as camera poses +and 3D landmarks to align them with observations. With the growing importance +of deep learning in perception systems, there is an increasing need to +integrate BA with deep learning frameworks for enhanced reliability and +performance. However, widely-used C++-based BA frameworks, such as GTSAM, +g$^2$o, and Ceres, lack native integration with modern deep learning libraries +like PyTorch. This limitation affects their flexibility, adaptability, ease of +debugging, and overall implementation efficiency. To address this gap, we +introduce an eager-mode BA framework seamlessly integrated with PyPose, +providing PyTorch-compatible interfaces with high efficiency. Our approach +includes GPU-accelerated, differentiable, and sparse operations designed for +2nd-order optimization, Lie group and Lie algebra operations, and linear +solvers. Our eager-mode BA on GPU demonstrates substantial runtime efficiency, +achieving an average speedup of 18.5$\times$, 22$\times$, and 23$\times$ +compared to GTSAM, g$^2$o, and Ceres, respectively. + +
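+ To make the eager-mode idea concrete, here is a toy bundle adjustment written
+directly against PyTorch autograd: a first-order, pinhole-only sketch on
+synthetic data (real frameworks, including the one described here, use
+Lie-group poses, sparsity-aware second-order solvers, and GPU kernels):
+
+```python
+import torch
+
+torch.manual_seed(0)
+points = torch.randn(20, 3, requires_grad=True)    # 3D landmarks
+cams = torch.zeros(4, 3, requires_grad=True)       # camera translations only
+obs = torch.randn(4, 20, 2)                        # observed pixel coordinates
+
+opt = torch.optim.Adam([points, cams], lr=1e-2)
+for _ in range(200):
+    opt.zero_grad()
+    p = points[None] + cams[:, None]               # points in each camera frame
+    proj = p[..., :2] / p[..., 2:].clamp(min=1e-3) # pinhole projection
+    loss = ((proj - obs) ** 2).mean()              # reprojection error
+    loss.backward()                                # gradients flow eagerly
+    opt.step()
+```
+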
+
+
+
+
+ + ☆ Massively Multi-Person 3D Human Motion Forecasting with Scene Context + + +
+ Forecasting long-term 3D human motion is challenging: the stochasticity of
+human behavior makes it hard to generate realistic human motion from the input
+sequence alone. Information on the scene environment and the motion of nearby
+people can greatly aid the generation process. We propose a scene-aware social
+transformer model (SAST) to forecast long-term (10s) human motion. Unlike
+previous models, our approach can model interactions between both widely
+varying numbers of people and objects in a scene. We combine a temporal
+convolutional encoder-decoder architecture with a Transformer-based bottleneck
+that allows us to efficiently combine motion and scene information. We model
+the conditional motion distribution using denoising diffusion models. We
+benchmark our approach on the Humans in Kitchens dataset, which contains 1 to
+16 persons and 29 to 50 objects that are visible simultaneously. Our model
+outperforms other approaches in terms of realism and diversity on different
+metrics and in a user study. Code is available at
+https://github.com/felixbmuller/SAST.
+
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ☆ multiPI-TransBTS: A Multi-Path Learning Framework for Brain Tumor Image + Segmentation Based on Multi-Physical Information + + +
+ Brain Tumor Segmentation (BraTS) plays a critical role in clinical diagnosis, +treatment planning, and monitoring the progression of brain tumors. However, +due to the variability in tumor appearance, size, and intensity across +different MRI modalities, automated segmentation remains a challenging task. In +this study, we propose a novel Transformer-based framework, multiPI-TransBTS, +which integrates multi-physical information to enhance segmentation accuracy. +The model leverages spatial information, semantic information, and multi-modal +imaging data, addressing the inherent heterogeneity in brain tumor +characteristics. The multiPI-TransBTS framework consists of an encoder, an +Adaptive Feature Fusion (AFF) module, and a multi-source, multi-scale feature +decoder. The encoder incorporates a multi-branch architecture to separately +extract modality-specific features from different MRI sequences. The AFF module +fuses information from multiple sources using channel-wise and element-wise +attention, ensuring effective feature recalibration. The decoder combines both +common and task-specific features through a Task-Specific Feature Introduction +(TSFI) strategy, producing accurate segmentation outputs for Whole Tumor (WT), +Tumor Core (TC), and Enhancing Tumor (ET) regions. Comprehensive evaluations on +the BraTS2019 and BraTS2020 datasets demonstrate the superiority of +multiPI-TransBTS over the state-of-the-art methods. The model consistently +achieves better Dice coefficients, Hausdorff distances, and Sensitivity scores, +highlighting its effectiveness in addressing the BraTS challenges. Our results +also indicate the need for further exploration of the balance between precision +and recall in the ET segmentation task. The proposed framework represents a +significant advancement in BraTS, with potential implications for improving +clinical outcomes for brain tumor patients. + +
+
+
+
+
+ + ☆ Precise Forecasting of Sky Images Using Spatial Warping + + +
+ The intermittency of solar power, due to occlusion from cloud cover, is one
+of the key factors inhibiting its widespread use in both commercial and
+residential settings. Hence, real-time forecasting of solar irradiance for
+grid-connected photovoltaic systems is necessary to schedule and allocate
+resources across the grid. Ground-based imagers that capture wide field-of-view
+images of the sky are commonly used to monitor cloud movement around a
+particular site in an effort to forecast solar irradiance. However, these wide
+FOV imagers capture a distorted image of the sky, where regions near the
+horizon are heavily compressed. This hinders the ability to precisely predict
+cloud motion near the horizon, which especially affects prediction over longer
+time horizons. In this work, we combat the aforementioned constraint by
+introducing a deep learning method to predict a future sky image frame with
+higher resolution than previous methods. Our main contribution is to derive an
+optimal warping method to counter the adverse effects of clouds at the horizon,
+and to learn a framework for future sky image prediction that better determines
+cloud evolution over longer time horizons.
+
+
+
+
+
+ + ☆ JEAN: Joint Expression and Audio-guided NeRF-based Talking Face + Generation BMVC 2024 + + +
+ We introduce a novel method for joint expression and audio-guided talking +face generation. Recent approaches either struggle to preserve the speaker +identity or fail to produce faithful facial expressions. To address these +challenges, we propose a NeRF-based network. Since we train our network on +monocular videos without any ground truth, it is essential to learn +disentangled representations for audio and expression. We first learn audio +features in a self-supervised manner, given utterances from multiple subjects. +By incorporating a contrastive learning technique, we ensure that the learned +audio features are aligned to the lip motion and disentangled from the muscle +motion of the rest of the face. We then devise a transformer-based architecture +that learns expression features, capturing long-range facial expressions and +disentangling them from the speech-specific mouth movements. Through +quantitative and qualitative evaluation, we demonstrate that our method can +synthesize high-fidelity talking face videos, achieving state-of-the-art facial +expression transfer along with lip synchronization to unseen audio. + +
+
+ comment: Accepted by BMVC 2024. Project Page: + https://starc52.github.io/publications/2024-07-19-JEAN +
+
+
+
+
+ + ☆ Autopet III challenge: Incorporating anatomical knowledge into nnUNet + for lesion segmentation in PET/CT + + +
+ Lesion segmentation in PET/CT imaging is essential for precise tumor
+characterization, which supports personalized treatment planning and enhances
+diagnostic precision in oncology. However, accurate manual segmentation of
+lesions is time-consuming and prone to inter-observer variability. Given the
+rising demand and clinical use of PET/CT, automated segmentation methods,
+particularly deep-learning-based approaches, have become increasingly
+relevant. The autoPET III Challenge focuses on advancing automated segmentation
+of tumor lesions in PET/CT images in a multitracer multicenter setting,
+addressing the clinical need for quantitative, robust, and generalizable
+solutions. Building on previous challenges, the third iteration of the autoPET
+challenge introduces a more diverse dataset featuring two different tracers
+(FDG and PSMA) from two clinical centers. To this end, we developed a
+classifier that identifies the tracer of the given PET/CT based on the Maximum
+Intensity Projection of the PET scan. We trained two individual nnUNet
+ensembles, one for each tracer, where anatomical labels are included as a
+multi-label task to enhance the model's performance. Our final submission
+achieves cross-validation Dice scores of 76.90% and 61.33% for the publicly
+available FDG and PSMA datasets, respectively. The code is available at
+https://github.com/hakal104/autoPETIII/ .
+
+
+ comment: AutoPET III challenge submission +
+
+
+
+
+ + ☆ MoRAG -- Multi-Fusion Retrieval Augmented Generation for Human Motion + + +
+ We introduce MoRAG, a novel multi-part fusion based retrieval-augmented +generation strategy for text-based human motion generation. The method enhances +motion diffusion models by leveraging additional knowledge obtained through an +improved motion retrieval process. By effectively prompting large language +models (LLMs), we address spelling errors and rephrasing issues in motion +retrieval. Our approach utilizes a multi-part retrieval strategy to improve the +generalizability of motion retrieval across the language space. We create +diverse samples through the spatial composition of the retrieved motions. +Furthermore, by utilizing low-level, part-specific motion information, we can +construct motion samples for unseen text descriptions. Our experiments +demonstrate that our framework can serve as a plug-and-play module, improving +the performance of motion diffusion models. Code, pretrained models and sample +videos will be made available at: https://motion-rag.github.io/ + +
+
+
+
+
+ + ☆ Optimal Visual Search with Highly Heuristic Decision Rules + + +
+ Visual search is a fundamental natural task for humans and other animals. We +investigated the decision processes humans use when searching briefly presented +displays having well-separated potential target-object locations. Performance +was compared with the Bayesian-optimal decision process under the assumption +that the information from the different potential target locations is +statistically independent. Surprisingly, humans performed slightly better than +optimal, despite humans' substantial loss of sensitivity in the fovea, and the +implausibility of the human brain replicating the optimal computations. We show +that three factors can quantitatively explain these seemingly paradoxical +results. Most importantly, simple and fixed heuristic decision rules reach near +optimal search performance. Secondly, foveal neglect primarily affects only the +central potential target location. Finally, spatially correlated neural noise +causes search performance to exceed that predicted for independent noise. These +findings have far-reaching implications for understanding visual search tasks +and other identification tasks in humans and other animals. + +
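+ The abstract's central observation, that a fixed max-style heuristic comes
+close to the Bayesian-optimal observer when the noise is independent, is
+easy to reproduce in a toy simulation; the numbers below (8 locations,
+d' = 2) are illustrative assumptions, not the paper's parameters.
+
+import numpy as np
+
+rng = np.random.default_rng(0)
+M, dprime, trials = 8, 2.0, 200_000
+
+# Target present on half the trials, at one random location.
+present = rng.random(trials) < 0.5
+loc = rng.integers(0, M, trials)
+x = rng.standard_normal((trials, M))
+x[present, loc[present]] += dprime
+
+# Bayesian-optimal detection: average the per-location likelihood
+# ratios exp(d'*x - d'^2/2) across locations and compare to 1.
+lr = np.exp(dprime * x - dprime ** 2 / 2).mean(axis=1)
+opt_acc = ((lr > 1.0) == present).mean()
+
+# Fixed heuristic: respond "present" when the maximum response exceeds
+# a single criterion, here tuned by a coarse scan.
+heur_acc = max(((x.max(axis=1) > c) == present).mean()
+               for c in np.linspace(0.0, 4.0, 81))
+
+print(f"ideal observer: {opt_acc:.3f}, max rule: {heur_acc:.3f}")
+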
+
+
+
+
+ + ☆ Applications of Knowledge Distillation in Remote Sensing: A Survey + + +
+ With the ever-growing complexity of models in the field of remote sensing +(RS), there is an increasing demand for solutions that balance model accuracy +with computational efficiency. Knowledge distillation (KD) has emerged as a +powerful tool to meet this need, enabling the transfer of knowledge from large, +complex models to smaller, more efficient ones without significant loss in +performance. This review article provides an extensive examination of KD and +its innovative applications in RS. KD, a technique developed to transfer +knowledge from a complex, often cumbersome model (teacher) to a more compact +and efficient model (student), has seen significant evolution and application +across various domains. Initially, we introduce the fundamental concepts and +historical progression of KD methods. The advantages of employing KD are +highlighted, particularly in terms of model compression, enhanced computational +efficiency, and improved performance, which are pivotal for practical +deployments in RS scenarios. The article provides a comprehensive taxonomy of +KD techniques, where each category is critically analyzed to demonstrate the +breadth and depth of the alternative options, and illustrates specific case +studies that showcase the practical implementation of KD methods in RS tasks, +such as instance segmentation and object detection. Further, the review +discusses the challenges and limitations of KD in RS, including practical +constraints and prospective future directions, providing a comprehensive +overview for researchers and practitioners in the field of RS. Through this +organization, the paper not only elucidates the current state of research in KD +but also sets the stage for future research opportunities, thereby contributing +significantly to both academic research and real-world applications. + +
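+ For readers new to the technique, the teacher-student transfer that the
+survey builds on is most often implemented as a temperature-softened KL
+term mixed with the ordinary label loss; the generic PyTorch sketch below
+follows the classic soft-target recipe and is not specific to any remote
+sensing model.
+
+import torch.nn.functional as F
+
+def kd_loss(student_logits, teacher_logits, labels, T=4.0, alpha=0.5):
+    # Soften both distributions with temperature T; the T*T factor keeps
+    # the gradient magnitude comparable to the hard-label term.
+    soft = F.kl_div(
+        F.log_softmax(student_logits / T, dim=1),
+        F.softmax(teacher_logits / T, dim=1),
+        reduction="batchmean",
+    ) * (T * T)
+    hard = F.cross_entropy(student_logits, labels)
+    return alpha * soft + (1 - alpha) * hard
+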
+
+ comment: 50 pages, 11 figures and 9 tables +
+
+
+
+
+ + ☆ SPRMamba: Surgical Phase Recognition for Endoscopic Submucosal + Dissection with Mamba + + +
+ Endoscopic Submucosal Dissection (ESD) is a minimally invasive procedure
+initially designed for the treatment of early gastric cancer but is now widely
+used for various gastrointestinal lesions. Computer-assisted surgery systems
+have played a crucial role in improving the precision and safety of ESD
+procedures; however, their effectiveness depends on accurate recognition of
+surgical phases. The intricate nature of ESD, with different lesion
+characteristics and tissue structures, presents challenges for real-time
+surgical phase recognition algorithms. Existing surgical phase recognition
+algorithms struggle to efficiently capture temporal contexts in video-based
+scenarios, leading to insufficient performance. To address these issues, we
+propose SPRMamba, a novel Mamba-based framework for ESD surgical phase
+recognition. SPRMamba leverages the strengths of Mamba for long-term temporal
+modeling while introducing the Scaled Residual TranMamba block to enhance the
+capture of fine-grained details, overcoming the limitations of traditional
+temporal models like Temporal Convolutional Networks and Transformers.
+Moreover, a Temporal Sample Strategy is introduced to accelerate processing,
+which is essential for real-time phase recognition in clinical settings.
+Extensive testing on the ESD385 dataset and the cholecystectomy Cholec80
+dataset demonstrates that SPRMamba surpasses existing state-of-the-art methods
+and exhibits greater robustness across various surgical phase recognition
+tasks.
+
+
+
+
+
+ + ☆ Brain-Streams: fMRI-to-Image Reconstruction with Multi-modal Guidance + + +
+ Understanding how humans process visual information is one of the crucial +steps for unraveling the underlying mechanism of brain activity. Recently, this +curiosity has motivated the fMRI-to-image reconstruction task; given the fMRI +data from visual stimuli, it aims to reconstruct the corresponding visual +stimuli. Surprisingly, leveraging powerful generative models such as the Latent +Diffusion Model (LDM) has shown promising results in reconstructing complex +visual stimuli such as high-resolution natural images from vision datasets. +Despite the impressive structural fidelity of these reconstructions, they often +lack details of small objects, ambiguous shapes, and semantic nuances. +Consequently, the incorporation of additional semantic knowledge, beyond mere +visuals, becomes imperative. In light of this, we exploit how modern LDMs +effectively incorporate multi-modal guidance (text guidance, visual guidance, +and image layout) for structurally and semantically plausible image +generations. Specifically, inspired by the two-streams hypothesis suggesting +that perceptual and semantic information are processed in different brain +regions, our framework, Brain-Streams, maps fMRI signals from these brain +regions to appropriate embeddings. That is, by extracting textual guidance from +semantic information regions and visual guidance from perceptual information +regions, Brain-Streams provides accurate multi-modal guidance to LDMs. We +validate the reconstruction ability of Brain-Streams both quantitatively and +qualitatively on a real fMRI dataset comprising natural image stimuli and fMRI +data. + +
+
+
+
+
+ + ☆ Denoising diffusion models for high-resolution microscopy image + restoration + + +
+ Advances in microscopy imaging enable researchers to visualize structures at
+the nanoscale level, thereby unraveling intricate details of biological
+organization. However, challenges such as image noise, photobleaching of
+fluorophores, and low tolerability of biological samples to high light doses
+remain, restricting temporal resolutions and experiment durations. Reduced
+laser doses enable longer measurements at the cost of lower resolution and
+increased noise, which hinders accurate downstream analyses. Here we train a
+denoising diffusion probabilistic model (DDPM) to predict high-resolution
+images by conditioning the model on low-resolution information. Additionally,
+the probabilistic aspect of the DDPM allows for repeated generation of images
+that tend to further increase the signal-to-noise ratio. We show that our model
+achieves performance better than or comparable to the previously
+best-performing methods across four highly diverse datasets. Importantly, while
+each of the previous methods shows competitive performance on some, but not
+all, datasets, our method consistently achieves high performance across all
+four datasets, suggesting high generalizability.
+
+
+
+
+
+ + ☆ Online Refractive Camera Model Calibration in Visual Inertial Odometry IROS 2024 + + +
+ This paper presents a general refractive camera model and online +co-estimation of odometry and the refractive index of unknown media. This +enables operation in diverse and varying refractive fluids, given only the +camera calibration in air. The refractive index is estimated online as a state +variable of a monocular visual-inertial odometry framework in an iterative +formulation using the proposed camera model. The method was verified on data +collected using an underwater robot traversing inside a pool. The evaluations +demonstrate convergence to the ideal refractive index for water despite +significant perturbations in the initialization. Simultaneously, the approach +enables on-par visual-inertial odometry performance in refractive media without +prior knowledge of the refractive index or requirement of medium-specific +camera calibration. + +
+
+ comment: Accepted at the 2024 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2024), 8 pages +
+
+
+
+
+ + ☆ PAD-FT: A Lightweight Defense for Backdoor Attacks via Data Purification + and Fine-Tuning + + +
+ Backdoor attacks pose a significant threat to deep neural networks, +particularly as recent advancements have led to increasingly subtle +implantation, making the defense more challenging. Existing defense mechanisms +typically rely on an additional clean dataset as a standard reference and +involve retraining an auxiliary model or fine-tuning the entire victim model. +However, these approaches are often computationally expensive and not always +feasible in practical applications. In this paper, we propose a novel and +lightweight defense mechanism, termed PAD-FT, that does not require an +additional clean dataset and fine-tunes only a very small part of the model to +disinfect the victim model. To achieve this, our approach first introduces a +simple data purification process to identify and select the most-likely clean +data from the poisoned training dataset. The self-purified clean dataset is +then used for activation clipping and fine-tuning only the last classification +layer of the victim model. By integrating data purification, activation +clipping, and classifier fine-tuning, our mechanism PAD-FT demonstrates +superior effectiveness across multiple backdoor attack methods and datasets, as +confirmed through extensive experimental evaluation. + +
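+ The abstract names the three ingredients but not their exact formulas, so
+the sketch below fills the gaps with loudly labeled placeholders: a
+lowest-loss filter standing in for the purification rule, a clamp hook for
+activation clipping, and last-layer-only fine-tuning; model.backbone and
+model.classifier are assumed attribute names, not PAD-FT's API.
+
+import torch
+import torch.nn.functional as F
+
+def pad_ft_sketch(model, X, Y, clip_value=1.0, epochs=5, lr=1e-3):
+    # 1) Placeholder purification: keep the 20% lowest-loss samples as
+    #    "likely clean" (the actual selection rule is not given here).
+    with torch.no_grad():
+        losses = F.cross_entropy(model(X), Y, reduction="none")
+    keep = losses.argsort()[: max(1, int(0.2 * len(X)))]
+    Xc, Yc = X[keep], Y[keep]
+
+    # 2) Activation clipping via a forward hook on the (assumed) feature
+    #    extractor, bounding channels a backdoor might exploit.
+    def clip_hook(module, inputs, output):
+        return output.clamp(max=clip_value)
+    handle = model.backbone.register_forward_hook(clip_hook)
+
+    # 3) Fine-tune only the (assumed) last classification layer.
+    for p in model.parameters():
+        p.requires_grad_(False)
+    for p in model.classifier.parameters():
+        p.requires_grad_(True)
+    opt = torch.optim.Adam(model.classifier.parameters(), lr=lr)
+    for _ in range(epochs):
+        opt.zero_grad()
+        F.cross_entropy(model(Xc), Yc).backward()
+        opt.step()
+    handle.remove()
+    return model
+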
+
+
+
+
+ + ☆ SFDA-rPPG: Source-Free Domain Adaptive Remote Physiological Measurement + with Spatio-Temporal Consistency + + +
+ Remote Photoplethysmography (rPPG) is a non-contact method that uses facial
+video to predict changes in blood volume, enabling physiological metrics
+measurement. Traditional rPPG models often struggle with poor generalization
+capacity in unseen domains. Current solutions to this problem are to improve
+generalization in the target domain through Domain Generalization (DG) or
+Domain Adaptation (DA). However, both approaches require access to source
+domain data as well as target domain data, which is infeasible in scenarios
+with limited access to source data and also raises privacy concerns about
+accessing source domain data. In this paper, we propose the first Source-free
+Domain Adaptation benchmark for rPPG measurement (SFDA-rPPG), which overcomes
+these limitations by enabling effective domain adaptation without access to
+source domain data. Our framework incorporates a Three-Branch Spatio-Temporal
+Consistency Network (TSTC-Net) to enhance feature consistency across domains.
+Furthermore, we propose a new rPPG distribution alignment loss based on the
+Frequency-domain Wasserstein Distance (FWD), which leverages optimal transport
+to align power spectrum distributions across domains effectively and further
+enforces the alignment of the three branches. Extensive cross-domain
+experiments and ablation studies demonstrate the effectiveness of our proposed
+method in source-free domain adaptation settings. Our findings highlight the
+significant contribution of the proposed FWD loss for distributional alignment,
+providing a valuable reference for future research and applications. The source
+code is available at https://github.com/XieYiping66/SFDA-rPPG
+
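+ For one-dimensional distributions such as normalized power spectra, the
+Wasserstein-1 distance has a closed form as the integrated gap between the
+two CDFs, which suggests a compact implementation of a frequency-domain
+alignment loss; this is a sketch of the general idea, and the paper's
+exact FWD formulation may differ in detail.
+
+import torch
+
+def fwd_loss(x, y):
+    # Closed-form 1-D Wasserstein-1: integrate the absolute gap between
+    # the CDFs of the two normalized power spectra (unit bin width).
+    def spectrum_cdf(s):
+        p = torch.fft.rfft(s, dim=-1).abs().pow(2)
+        p = p / p.sum(dim=-1, keepdim=True)  # normalize to a distribution
+        return torch.cumsum(p, dim=-1)
+    return (spectrum_cdf(x) - spectrum_cdf(y)).abs().sum(dim=-1).mean()
+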
+
+
+
+
+ + ☆ Multi-Sensor Deep Learning for Glacier Mapping + + +
+ The more than 200,000 glaciers outside the ice sheets play a crucial role in +our society by influencing sea-level rise, water resource management, natural +hazards, biodiversity, and tourism. However, only a fraction of these glaciers +benefit from consistent and detailed in-situ observations that allow for +assessing their status and changes over time. This limitation can, in part, be +overcome by relying on satellite-based Earth Observation techniques. +Satellite-based glacier mapping applications have historically mainly relied on +manual and semi-automatic detection methods, while recently, a fast and notable +transition to deep learning techniques has started. + This chapter reviews how combining multi-sensor remote sensing data and deep +learning allows us to better delineate (i.e. map) glaciers and detect their +temporal changes. We explain how relying on deep learning multi-sensor +frameworks to map glaciers benefits from the extensive availability of regional +and global glacier inventories. We also analyse the rationale behind glacier +mapping, the benefits of deep learning methodologies, and the inherent +challenges in integrating multi-sensor earth observation data with deep +learning algorithms. + While our review aims to provide a broad overview of glacier mapping efforts, +we highlight a few setups where deep learning multi-sensor remote sensing +applications have a considerable potential added value. This includes +applications for debris-covered and rock glaciers that are visually difficult +to distinguish from surroundings and for calving glaciers that are in contact +with the ocean. These specific cases are illustrated through a series of visual +imageries, highlighting some significant advantages and challenges when +detecting glacier changes, including dealing with seasonal snow cover, changing +debris coverage, and distinguishing glacier fronts from the surrounding sea +ice. + +
+
+ comment: This article will be a chapter of the book Deep Learning for + Multi-Sensor Earth Observation, to be published by Elsevier +
+
+
+
+
+ + ☆ PhysMamba: Efficient Remote Physiological Measurement with SlowFast + Temporal Difference Mamba + + +
+ Facial-video based remote photoplethysmography (rPPG) aims at measuring
+physiological signals and monitoring heart activity without any contact,
+showing significant potential in various applications. Previous deep learning
+based rPPG measurement methods are primarily based on CNNs and Transformers.
+However, the limited receptive fields of CNNs restrict their ability to capture
+long-range spatio-temporal dependencies, while Transformers also struggle with
+modeling long video sequences with high complexity. Recently, the state space
+models (SSMs) represented by Mamba have become known for their impressive
+performance in capturing long-range dependencies from long sequences. In this
+paper, we propose PhysMamba, a Mamba-based framework, to efficiently represent
+long-range physiological dependencies from facial videos. Specifically, we
+introduce the Temporal Difference Mamba block to first enhance local dynamic
+differences and further model the long-range spatio-temporal context. Moreover,
+a dual-stream SlowFast architecture is utilized to fuse the multi-scale
+temporal features. Extensive experiments are conducted on three benchmark
+datasets to demonstrate the superiority and efficiency of PhysMamba. The codes
+are available at https://github.com/Chaoqi31/PhysMamba
+
+
+ comment: Accepted by CCBR 2024 +
+
+
+
+
+ + ☆ On Vision Transformers for Classification Tasks in Side-Scan Sonar + Imagery + + +
+ Side-scan sonar (SSS) imagery presents unique challenges in the +classification of man-made objects on the seafloor due to the complex and +varied underwater environments. Historically, experts have manually interpreted +SSS images, relying on conventional machine learning techniques with +hand-crafted features. While Convolutional Neural Networks (CNNs) significantly +advanced automated classification in this domain, they often fall short when +dealing with diverse seafloor textures, such as rocky or ripple sand bottoms, +where false positive rates may increase. Recently, Vision Transformers (ViTs) +have shown potential in addressing these limitations by utilizing a +self-attention mechanism to capture global information in image patches, +offering more flexibility in processing spatial hierarchies. This paper +rigorously compares the performance of ViT models alongside commonly used CNN +architectures, such as ResNet and ConvNext, for binary classification tasks in +SSS imagery. The dataset encompasses diverse geographical seafloor types and is +balanced between the presence and absence of man-made objects. ViT-based models +exhibit superior classification performance across f1-score, precision, recall, +and accuracy metrics, although at the cost of greater computational resources. +CNNs, with their inductive biases, demonstrate better computational efficiency, +making them suitable for deployment in resource-constrained environments like +underwater vehicles. Future research directions include exploring +self-supervised learning for ViTs and multi-modal fusion to further enhance +performance in challenging underwater environments. + +
+
+
+
+
+ + ☆ LEMON: Localized Editing with Mesh Optimization and Neural Shaders + + +
+ In practical use cases, polygonal mesh editing can be faster than generating +new ones, but it can still be challenging and time-consuming for users. +Existing solutions for this problem tend to focus on a single task, either +geometry or novel view synthesis, which often leads to disjointed results +between the mesh and view. In this work, we propose LEMON, a mesh editing +pipeline that combines neural deferred shading with localized mesh +optimization. Our approach begins by identifying the most important vertices in +the mesh for editing, utilizing a segmentation model to focus on these key +regions. Given multi-view images of an object, we optimize a neural shader and +a polygonal mesh while extracting the normal map and the rendered image from +each view. By using these outputs as conditioning data, we edit the input +images with a text-to-image diffusion model and iteratively update our dataset +while deforming the mesh. This process results in a polygonal mesh that is +edited according to the given text instruction, preserving the geometric +characteristics of the initial mesh while focusing on the most significant +areas. We evaluate our pipeline using the DTU dataset, demonstrating that it +generates finely-edited meshes more rapidly than the current state-of-the-art +methods. We include our code and additional results in the supplementary +material. + +
+
+
+
+
+ + ☆ Computational Imaging for Long-Term Prediction of Solar Irradiance + + +
+ The occlusion of the sun by clouds is one of the primary sources of
+uncertainty in solar power generation, and is a factor that affects the
+widespread use of solar power as a primary energy source. Real-time
+forecasting of cloud movement and, as a result, solar irradiance is necessary
+to schedule and allocate energy across grid-connected photovoltaic systems.
+Previous works monitored cloud movement using wide-angle field of view imagery
+of the sky. However, such images have poor resolution for clouds that appear
+near the horizon, which reduces their effectiveness for long-term prediction of
+solar occlusion. Specifically, to be able to predict occlusion of the sun over
+long time periods, clouds that are near the horizon need to be detected, and
+their velocities estimated precisely. To enable such a system, we design and
+deploy a catadioptric system that delivers wide-angle imagery with uniform
+spatial resolution of the sky over its field of view. To enable prediction over
+a longer time horizon, we design an algorithm that uses carefully selected
+spatio-temporal slices of the imagery, with estimated wind direction and
+velocity as inputs. Using ray-tracing simulations as well as a real testbed
+deployed outdoors, we show that the system is capable of predicting solar
+occlusion as well as irradiance for tens of minutes in the future, which is an
+order of magnitude improvement over prior work.
+
+
+
+
+
+ + ☆ Mixture of Prompt Learning for Vision Language Models + + +
+ As powerful pre-trained vision-language models (VLMs) like CLIP gain
+prominence, numerous studies have attempted to combine VLMs for downstream
+tasks. Among these, prompt learning has been validated as an effective method
+for adapting to new tasks, requiring only a small number of parameters.
+However, current prompt learning methods face two challenges: first, a single
+soft prompt struggles to capture the diverse styles and patterns within a
+dataset; second, fine-tuning soft prompts is prone to overfitting. To address
+these challenges, we propose a mixture of soft prompt learning method
+incorporating a routing module. This module is able to capture a dataset's
+varied styles and dynamically selects the most suitable prompts for each
+instance. Additionally, we introduce a novel gating mechanism to ensure the
+router selects prompts based on their similarity to hard prompt templates,
+which both retains knowledge from hard prompts and improves selection
+accuracy. We also implement semantically grouped text-level supervision,
+initializing each soft prompt with the token embeddings of manually designed
+templates from its group and applying a contrastive loss between the resulting
+text features and the text features encoded from hard prompts. This supervision
+ensures that the text features derived from soft prompts remain close to those
+from their corresponding hard prompts, preserving initial knowledge and
+mitigating overfitting. Our method has been validated on 11 datasets,
+demonstrating evident improvements in few-shot learning, domain generalization,
+and base-to-new generalization scenarios compared to existing baselines. The
+code will be available at \url{https://anonymous.4open.science/r/mocoop-6387}
+
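+ A bare-bones version of instance-conditioned prompt mixing might look like
+the sketch below, where routing keys are initialized from frozen
+hard-prompt embeddings so that selection stays anchored to the templates;
+all names and shapes are illustrative assumptions, not the paper's code.
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class PromptRouter(nn.Module):
+    def __init__(self, n_prompts, prompt_len, dim, hard_prompt_emb):
+        super().__init__()
+        # Learnable soft prompts, one set per style cluster.
+        self.prompts = nn.Parameter(
+            0.02 * torch.randn(n_prompts, prompt_len, dim))
+        # Frozen keys from hard-prompt template embeddings, (n_prompts, dim).
+        self.register_buffer("keys", hard_prompt_emb)
+
+    def forward(self, image_feat):                 # (B, dim)
+        weights = F.softmax(image_feat @ self.keys.t(), dim=-1)
+        # Instance-specific weighted mixture of soft prompts, (B, len, dim).
+        return torch.einsum("bn,nld->bld", weights, self.prompts)
+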
+
+
+
+
+ + ☆ ChefFusion: Multimodal Foundation Model Integrating Recipe and Food + Image Generation + + +
+ Significant work has been conducted in the domain of food computing, yet
+these studies typically focus on single tasks such as t2t (instruction
+generation from food titles and ingredients), i2t (recipe generation from food
+images), or t2i (food image generation from recipes). None of these approaches
+integrate all modalities simultaneously. To address this gap, we introduce a
+novel food computing foundation model that achieves true multimodality,
+encompassing tasks such as t2t, t2i, i2t, it2t, and t2ti. By leveraging large
+language models (LLMs) and pre-trained image encoder and decoder models, our
+model can perform a diverse array of food computing-related tasks, including
+food understanding, food recognition, recipe generation, and food image
+generation. Compared to previous models, our foundation model demonstrates a
+significantly broader range of capabilities and exhibits superior performance,
+particularly in food image generation and recipe generation tasks. We have
+open-sourced ChefFusion on GitHub.
+
+
+
+
+
+ + ☆ Panoptic-Depth Forecasting + + +
+ Forecasting the semantics and 3D structure of scenes is essential for robots +to navigate and plan actions safely. Recent methods have explored semantic and +panoptic scene forecasting; however, they do not consider the geometry of the +scene. In this work, we propose the panoptic-depth forecasting task for jointly +predicting the panoptic segmentation and depth maps of unobserved future +frames, from monocular camera images. To facilitate this work, we extend the +popular KITTI-360 and Cityscapes benchmarks by computing depth maps from LiDAR +point clouds and leveraging sequential labeled data. We also introduce a +suitable evaluation metric that quantifies both the panoptic quality and depth +estimation accuracy of forecasts in a coherent manner. Furthermore, we present +two baselines and propose the novel PDcast architecture that learns rich +spatio-temporal representations by incorporating a transformer-based encoder, a +forecasting module, and task-specific decoders to predict future panoptic-depth +outputs. Extensive evaluations demonstrate the effectiveness of PDcast across +two datasets and three forecasting tasks, consistently addressing the primary +challenges. We make the code publicly available at +https://pdcast.cs.uni-freiburg.de. + +
+
+
+
+
+ + ☆ Towards Global Localization using Multi-Modal Object-Instance + Re-Identification ICRA 2025 + + +
+ Re-identification (ReID) is a critical challenge in computer vision, +predominantly studied in the context of pedestrians and vehicles. However, +robust object-instance ReID, which has significant implications for tasks such +as autonomous exploration, long-term perception, and scene understanding, +remains underexplored. In this work, we address this gap by proposing a novel +dual-path object-instance re-identification transformer architecture that +integrates multimodal RGB and depth information. By leveraging depth data, we +demonstrate improvements in ReID across scenes that are cluttered or have +varying illumination conditions. Additionally, we develop a ReID-based +localization framework that enables accurate camera localization and pose +identification across different viewpoints. We validate our methods using two +custom-built RGB-D datasets, as well as multiple sequences from the open-source +TUM RGB-D datasets. Our approach demonstrates significant improvements in both +object instance ReID (mAP of 75.18) and localization accuracy (success rate of +83% on TUM-RGBD), highlighting the essential role of object ReID in advancing +robotic perception. Our models, frameworks, and datasets have been made +publicly available. + +
+
+ comment: 8 pages, 5 figures, 3 tables. Submitted to ICRA 2025 +
+
+
+
+
+ + ☆ Intraoperative Registration by Cross-Modal Inverse Neural Rendering MICCAI 2024 + + +
+ We present in this paper a novel approach for 3D/2D intraoperative +registration during neurosurgery via cross-modal inverse neural rendering. Our +approach separates implicit neural representation into two components, handling +anatomical structure preoperatively and appearance intraoperatively. This +disentanglement is achieved by controlling a Neural Radiance Field's appearance +with a multi-style hypernetwork. Once trained, the implicit neural +representation serves as a differentiable rendering engine, which can be used +to estimate the surgical camera pose by minimizing the dissimilarity between +its rendered images and the target intraoperative image. We tested our method +on retrospective patients' data from clinical cases, showing that our method +outperforms state-of-the-art while meeting current clinical standards for +registration. Code and additional resources can be found at +https://maxfehrentz.github.io/style-ngp/. + +
+
+ comment: Accepted at MICCAI 2024 +
+
+
+
+
+ + ☆ MitoSeg: Mitochondria Segmentation Tool + + +
+ Recent studies suggest a potential link between the physical structure of +mitochondria and neurodegenerative diseases. With advances in Electron +Microscopy techniques, it has become possible to visualize the boundary and +internal membrane structures of mitochondria in detail. It is crucial to +automatically segment mitochondria from these images to investigate the +relationship between mitochondria and diseases. In this paper, we present a +software solution for mitochondrial segmentation, highlighting mitochondria +boundaries in electron microscopy tomography images and generating +corresponding 3D meshes. + +
+
+
+
+
+ + ☆ Unveiling the Black Box: Independent Functional Module Evaluation for + Bird's-Eye-View Perception Model + + +
+ End-to-end models are emerging as the mainstream in autonomous driving
+perception. However, the inability to meticulously deconstruct their internal
+mechanisms results in diminished development efficacy and impedes the
+establishment of trust. As a pioneering effort on this issue, we present the
+Independent Functional Module Evaluation for Bird's-Eye-View Perception Model
+(BEV-IFME), a novel framework that juxtaposes a module's feature maps against
+ground truth within a unified semantic representation space to quantify their
+similarity, thereby assessing the training maturity of individual functional
+modules. The core of the framework lies in the process of feature map encoding
+and representation aligning, facilitated by our proposed two-stage Alignment
+AutoEncoder, which ensures the preservation of salient information and the
+consistency of feature structure. The metric for evaluating the training
+maturity of functional modules, the Similarity Score, demonstrates a robust
+positive correlation with BEV metrics, with an average correlation coefficient
+of 0.9387, attesting to the framework's reliability for assessment purposes.
+
+
+
+
+
+ + ☆ A Chinese Continuous Sign Language Dataset Based on Complex Environments + + +
+ The current bottleneck in continuous sign language recognition (CSLR)
+research lies in the fact that most publicly available datasets are limited to
+laboratory environments or television program recordings, resulting in a single
+background environment with uniform lighting, which significantly deviates from
+the diversity and complexity found in real-life scenarios. To address this
+challenge, we have constructed a new, large-scale dataset for Chinese
+continuous sign language (CSL) based on complex environments, termed the
+Complex Environment - Chinese Sign Language dataset (CE-CSL). This dataset
+encompasses 5,988 continuous CSL video clips collected from daily life scenes,
+featuring more than 70 different complex backgrounds to ensure
+representativeness and generalization capability. To tackle the impact of
+complex backgrounds on CSLR performance, we propose a time-frequency network
+(TFNet) model for continuous sign language recognition. This model extracts
+frame-level features and then utilizes both temporal and spectral information
+to separately derive sequence features before fusion, aiming to achieve
+efficient and accurate CSLR. Experimental results demonstrate that our approach
+achieves significant performance improvements on the CE-CSL, validating its
+effectiveness under complex background conditions. Additionally, our proposed
+method has also yielded highly competitive results when applied to three
+publicly available CSL datasets.
+
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ Tracking Any Point with Frame-Event Fusion Network at High Frame Rate + + +
+ Tracking any point based on image frames is constrained by frame rates,
+leading to instability in high-speed scenarios and limited generalization in
+real-world applications. To overcome these limitations, we propose an
+image-event fusion point tracker, FE-TAP, which combines the contextual
+information from image frames with the high temporal resolution of events,
+achieving high frame rate and robust point tracking under various challenging
+conditions. Specifically, we designed an Evolution Fusion module (EvoFusion) to
+model the image generation process guided by events. This module can
+effectively integrate valuable information from both modalities operating at
+different frequencies. To achieve smoother point trajectories, we employed a
+transformer-based refinement strategy that updates the point trajectories and
+features iteratively. Extensive experiments demonstrate that our method
+outperforms state-of-the-art approaches, particularly improving expected
+feature age by 24$\%$ on the EDS dataset. Finally, we qualitatively validated
+the robustness of our algorithm in real driving scenarios using our
+custom-designed high-resolution image-event synchronization device. Our source
+code will be released at https://github.com/ljx1002/FE-TAP.
+
+
+
+
+
+ + ☆ GaussianHeads: End-to-End Learning of Drivable Gaussian Head Avatars + from Coarse-to-fine Representations SIGGRAPH + + +
+ Real-time rendering of human head avatars is a cornerstone of many computer
+graphics applications, such as augmented reality, video games, and films, to
+name a few. Recent approaches address this challenge with computationally
+efficient geometry primitives in a carefully calibrated multi-view setup.
+While these approaches produce photorealistic head renderings, they often fail
+to represent complex motion changes such as the mouth interior and strongly
+varying head poses. We propose a new method to generate highly dynamic and
+deformable human head avatars from multi-view imagery in real-time. At the core
+of our method is a hierarchical representation of head models that allows us to
+capture the complex dynamics of facial expressions and head movements. First,
+with rich facial features extracted from raw input frames, we learn to deform
+the coarse facial geometry of the template mesh. We then initialize 3D
+Gaussians on the deformed surface and refine their positions in a fine step. We
+train this coarse-to-fine facial avatar model along with the head pose as a
+learnable parameter in an end-to-end framework. This enables not only
+controllable facial animation via video inputs, but also high-fidelity novel
+view synthesis of challenging facial expressions, such as tongue deformations
+and fine-grained teeth structure under large motion changes. Moreover, it
+encourages the learned head avatar to generalize towards new facial expressions
+and head poses at inference time. We demonstrate the performance of our method
+with comparisons against related methods on different datasets, spanning
+challenging facial expression sequences across multiple identities. We also
+show the potential application of our approach by demonstrating a
+cross-identity facial performance transfer application.
+
+
+ comment: ACM Transaction on Graphics (SIGGRAPH Asia 2024); Project page: + https://vcai.mpi-inf.mpg.de/projects/GaussianHeads/ +
+
+
+
+
+ + ☆ Differentiable Collision-Supervised Tooth Arrangement Network with a + Decoupling Perspective + + +
+ Tooth arrangement is an essential step in the digital orthodontic planning
+process. Existing learning-based methods use hidden teeth features to directly
+regress teeth motions, which couples target pose perception and motion
+regression. This coupling can lead to poor perception of the three-dimensional
+transformation. These methods also ignore the possible overlaps or gaps between
+the teeth of the predicted dentition, which is generally unacceptable.
+Therefore, we propose DTAN, a differentiable collision-supervised tooth
+arrangement network that decouples prediction tasks from feature modeling. DTAN
+decouples the tooth arrangement task by first predicting the hidden features of
+the final teeth poses and then using them to assist in regressing the motions
+between the beginning and target teeth. To learn the hidden features better,
+DTAN also decouples the teeth-hidden features into geometric and positional
+features, which are further supervised by feature consistency constraints.
+Furthermore, we propose a novel differentiable collision loss function for
+point cloud data to constrain the related gestures between teeth, which can be
+easily extended to other 3D point cloud tasks. We also propose an arch-width
+guided tooth arrangement network, named C-DTAN, to make the results
+controllable. We construct three different tooth arrangement datasets and
+achieve drastically improved performance in accuracy and speed compared with
+existing methods.
+
+
+ comment: 16 pages, 13 figures +
+
+
+
+
+ + ☆ Agglomerative Token Clustering ECCV 2024 + + +
+ We present Agglomerative Token Clustering (ATC), a novel token merging method +that consistently outperforms previous token merging and pruning methods across +image classification, image synthesis, and object detection & segmentation +tasks. ATC merges clusters through bottom-up hierarchical clustering, without +the introduction of extra learnable parameters. We find that ATC achieves +state-of-the-art performance across all tasks, and can even perform on par with +prior state-of-the-art when applied off-the-shelf, i.e. without fine-tuning. +ATC is particularly effective when applied with low keep rates, where only a +small fraction of tokens are kept and retaining task performance is especially +difficult. + +
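+ Since the core operation is bottom-up hierarchical clustering with no extra
+learnable parameters, a compact stand-in can be written with SciPy's
+agglomerative tools; the linkage and distance choices below are
+assumptions, not necessarily those made by ATC.
+
+import torch
+from scipy.cluster.hierarchy import fcluster, linkage
+
+def agglomerative_token_merge(tokens, keep_rate=0.25):
+    # tokens: (n, dim). Cluster bottom-up, then average-merge each
+    # cluster into a single token.
+    n = tokens.shape[0]
+    k = max(1, int(keep_rate * n))
+    Z = linkage(tokens.detach().cpu().numpy(), method="average")
+    labels = torch.as_tensor(fcluster(Z, t=k, criterion="maxclust"))
+    return torch.stack([tokens[labels == c].mean(dim=0)
+                        for c in labels.unique()])
+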
+
+ comment: ECCV 2024. Project webpage at https://vap.aau.dk/atc/ +
+
+
+
+
+ + ☆ Generation of Complex 3D Human Motion by Temporal and Spatial + Composition of Diffusion Models + + +
+ In this paper, we address the challenge of generating realistic 3D human
+motions for action classes that were never seen during the training phase. Our
+approach involves decomposing complex actions into simpler movements,
+specifically those observed during training, by leveraging the knowledge of
+human motion contained in GPT models. These simpler movements are then
+combined into a single, realistic animation using the properties of diffusion
+models. Our claim is that this decomposition and subsequent recombination of
+simple movements can synthesize an animation that accurately represents the
+complex input action. This method operates during the inference phase and can
+be integrated with any pre-trained diffusion model, enabling the synthesis of
+motion classes not present in the training data. We evaluate our method by
+dividing two benchmark human motion datasets into basic and complex actions,
+and then compare its performance against the state-of-the-art.
+
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ☆ LLM-wrapper: Black-Box Semantic-Aware Adaptation of Vision-Language + Foundation Models ECCV 2024 + + +
+ Vision Language Models (VLMs) have shown impressive performances on numerous +tasks but their zero-shot capabilities can be limited compared to dedicated or +fine-tuned models. Yet, fine-tuning VLMs comes with limitations as it requires +`white-box' access to the model's architecture and weights as well as expertise +to design the fine-tuning objectives and optimize the hyper-parameters, which +are specific to each VLM and downstream task. In this work, we propose +LLM-wrapper, a novel approach to adapt VLMs in a `black-box' manner by +leveraging large language models (LLMs) so as to reason on their outputs. We +demonstrate the effectiveness of LLM-wrapper on Referring Expression +Comprehension (REC), a challenging open-vocabulary task that requires spatial +and semantic reasoning. Our approach significantly boosts the performance of +off-the-shelf models, resulting in competitive results when compared with +classic fine-tuning. + +
+
+ comment: EVAL-FoMo workshop, ECCV 2024 +
+
+
+
+
+ + ☆ Tumor aware recurrent inter-patient deformable image registration of + computed tomography scans with lung cancer + + +
+ Background: Voxel-based analysis (VBA) for population level radiotherapy (RT)
+outcomes modeling requires topology preserving inter-patient deformable image
+registration (DIR) that preserves tumors on moving images while avoiding
+unrealistic deformations due to tumors occurring on fixed images. Purpose: We
+developed a tumor-aware recurrent registration (TRACER) deep learning (DL)
+method and evaluated its suitability for VBA. Methods: TRACER consists of
+encoder layers implemented with stacked 3D convolutional long short term memory
+network (3D-CLSTM) followed by decoder and spatial transform layers to compute
+dense deformation vector field (DVF). Multiple CLSTM steps are used to compute
+a progressive sequence of deformations. Input conditioning was applied by
+including tumor segmentations with 3D image pairs as input channels.
+Bidirectional tumor rigidity, image similarity, and deformation smoothness
+losses were used to optimize the network in an unsupervised manner. TRACER and
+multiple DL methods were trained with 204 3D CT image pairs from patients with
+lung cancers (LC) and evaluated using (a) Dataset I (N = 308 pairs) with DL
+segmented LCs, (b) Dataset II (N = 765 pairs) with manually delineated LCs, and
+(c) Dataset III with 42 LC patients treated with RT. Results: TRACER accurately
+aligned normal tissues. It best preserved tumors, indicated by the smallest
+tumor volume difference of 0.24\%, 0.40\%, and 0.13\% and mean square error in
+CT intensities of 0.005, 0.005, 0.004, computed between original and resampled
+moving image tumors, for Datasets I, II, and III, respectively. It resulted in
+the smallest planned RT tumor dose difference computed between original and
+resampled moving images of 0.01 Gy and 0.013 Gy when using a female and a male
+reference.
+
+
+ comment: Minor revision under the journal of Medical Physics +
+
+
+
+
+ + ☆ Finding the Subjective Truth: Collecting 2 Million Votes for + Comprehensive Gen-AI Model Evaluation + + +
+ Efficiently evaluating the performance of text-to-image models is difficult +as it inherently requires subjective judgment and human preference, making it +hard to compare different models and quantify the state of the art. Leveraging +Rapidata's technology, we present an efficient annotation framework that +sources human feedback from a diverse, global pool of annotators. Our study +collected over 2 million annotations across 4,512 images, evaluating four +prominent models (DALL-E 3, Flux.1, MidJourney, and Stable Diffusion) on style +preference, coherence, and text-to-image alignment. We demonstrate that our +approach makes it feasible to comprehensively rank image generation models +based on a vast pool of annotators and show that the diverse annotator +demographics reflect the world population, significantly decreasing the risk of +biases. + +
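+ The abstract does not state how the votes are aggregated into a ranking;
+one standard choice for pairwise preference data is a Bradley-Terry model,
+sketched below with entirely made-up win counts for four hypothetical
+models.
+
+import numpy as np
+
+def bradley_terry(wins, iters=200):
+    # wins[i, j] = number of times item i beat item j (zero diagonal).
+    n = wins.shape[0]
+    p = np.ones(n)
+    games = wins + wins.T
+    for _ in range(iters):
+        for i in range(n):
+            # Minorize-maximize update; the diagonal term contributes zero.
+            p[i] = wins[i].sum() / (games[i] / (p[i] + p)).sum()
+        p /= p.sum()
+    return p
+
+wins = np.array([[ 0, 60, 55, 70],
+                 [40,  0, 50, 65],
+                 [45, 50,  0, 60],
+                 [30, 35, 40,  0]])
+print(bradley_terry(wins))  # larger value = more preferred
+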
+
+
+
+
+ + ☆ ABHINAW: A method for Automatic Evaluation of Typography within + AI-Generated Images + + +
+ In the fast-evolving field of Generative AI, platforms like MidJourney,
+DALL-E, and Stable Diffusion have transformed Text-to-Image (T2I) Generation.
+However, despite their impressive ability to create high-quality images, they
+often struggle to generate accurate text within these images. Theoretically, if
+we could achieve accurate text generation in AI images in a ``zero-shot''
+manner, it would not only make AI-generated images more meaningful but also
+democratize the graphic design industry. The first step towards this goal is to
+create a robust scoring matrix for evaluating text accuracy in AI-generated
+images. Although there are existing benchmarking methods like CLIP SCORE and
+T2I-CompBench++, there is still a gap in systematically evaluating text and
+typography in AI-generated images, especially with diffusion-based methods. In
+this paper, we introduce a novel evaluation matrix designed explicitly for
+quantifying the performance of text and typography generation within
+AI-generated images. We use a letter-by-letter matching strategy to compute
+exact matching scores between the reference text and the AI-generated text. Our
+approach to calculating the score handles multiple redundancies such as
+repetition of words, case sensitivity, mixing of words, and irregular
+incorporation of letters. Moreover, we have developed a novel method, named
+brevity adjustment, to handle excess text. In addition, we present a
+quantitative analysis of frequent errors arising from frequently and less
+frequently used words. Project page is available at:
+https://github.com/Abhinaw3906/ABHINAW-MATRIX.
+
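+ A toy version of letter-by-letter matching with a crude brevity adjustment
+is shown below; the actual ABHINAW matrix handles more redundancy cases
+(word repetition, word mixing, and so on) than this illustration.
+
+from collections import Counter
+
+def letter_match_score(reference, generated):
+    # Case-insensitive multiset overlap of letters, ignoring spaces.
+    ref = Counter(reference.lower().replace(" ", ""))
+    gen = Counter(generated.lower().replace(" ", ""))
+    matched = sum(min(ref[c], gen[c]) for c in ref)
+    total_ref, total_gen = sum(ref.values()), sum(gen.values())
+    recall = matched / total_ref if total_ref else 0.0
+    # Crude brevity adjustment: penalize generations with excess text.
+    brevity = min(1.0, total_ref / total_gen) if total_gen else 0.0
+    return recall * brevity
+
+print(letter_match_score("HELLO WORLD", "helo worldd"))  # 0.9
+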
+
+
+
+
+ + ☆ SpheriGait: Enriching Spatial Representation via Spherical Projection + for LiDAR-based Gait Recognition + + +
+ Gait recognition is a rapidly progressing technique for the remote
+identification of individuals. Prior research predominantly employing 2D
+sensors to gather gait data has achieved notable advancements; nonetheless, it
+has unavoidably neglected the influence of 3D dynamic characteristics on
+recognition. Gait recognition utilizing LiDAR 3D point clouds not only directly
+captures 3D spatial features but also diminishes the impact of lighting
+conditions while ensuring privacy protection. The essence of the problem lies
+in how to effectively extract discriminative 3D dynamic representations from
+point clouds. In this paper, we propose a method named SpheriGait for
+extracting and enhancing dynamic features from point clouds for LiDAR-based
+gait recognition. Specifically, it substitutes the conventional point cloud
+plane projection method with spherical projection to augment the perception of
+dynamic features. Additionally, a network block named DAM-L is proposed to
+extract gait cues from the projected point cloud data. We conducted extensive
+experiments, and the results demonstrate that SpheriGait achieves
+state-of-the-art performance on the SUSTech1K dataset and verify that the
+spherical projection method can serve as a universal data preprocessing
+technique to enhance the performance of other LiDAR-based gait recognition
+methods, exhibiting exceptional flexibility and practicality.
+
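+ In general form, the spherical projection that replaces plane projection is
+the standard conversion of a LiDAR point cloud into a range image; a sketch
+follows, with resolution and field-of-view numbers that are illustrative
+rather than SpheriGait's settings.
+
+import numpy as np
+
+def spherical_projection(points, h=64, w=512, fov_up=15.0, fov_down=-15.0):
+    # points: (N, 3) LiDAR returns. Map each point to an (h, w) range
+    # image cell via its azimuth (yaw) and elevation (pitch).
+    x, y, z = points[:, 0], points[:, 1], points[:, 2]
+    r = np.linalg.norm(points, axis=1) + 1e-8
+    yaw = np.arctan2(y, x)                 # azimuth in [-pi, pi]
+    pitch = np.arcsin(z / r)               # elevation
+    fu, fd = np.radians(fov_up), np.radians(fov_down)
+    u = ((yaw + np.pi) / (2 * np.pi) * w).astype(int) % w
+    v = ((fu - pitch) / (fu - fd) * h).clip(0, h - 1).astype(int)
+    image = np.zeros((h, w), dtype=np.float32)
+    image[v, u] = r                        # last range wins per cell
+    return image
+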
+
+
+
+
+ + ☆ Distillation-free Scaling of Large SSMs for Images and Videos + + +
+ State-space models (SSMs), exemplified by S4, have introduced a novel context +modeling method by integrating state-space techniques into deep learning. +However, they struggle with global context modeling due to their +data-independent matrices. The Mamba model addressed this with data-dependent +variants via the S6 selective-scan algorithm, enhancing context modeling, +especially for long sequences. However, Mamba-based architectures are difficult +to scale with respect to the number of parameters, which is a major limitation +for vision applications. This paper addresses the scalability issue of large +SSMs for image classification and action recognition without requiring +additional techniques like knowledge distillation. We analyze the distinct +characteristics of Mamba-based and Attention-based models, proposing a +Mamba-Attention interleaved architecture that enhances scalability, robustness, +and performance. We demonstrate that the stable and efficient interleaved +architecture resolves the scalability issue of Mamba-based architectures for +images and videos and increases robustness to common artifacts like JPEG +compression. Our thorough evaluation on the ImageNet-1K, Kinetics-400 and +Something-Something-v2 benchmarks demonstrates that our approach improves the +accuracy of state-of-the-art Mamba-based architectures by up to $+1.7$. + +
+
+
+
+
+ + ☆ Physically-Based Photometric Bundle Adjustment in Non-Lambertian + Environments IROS 2024 + + +
+ Photometric bundle adjustment (PBA) is widely used in estimating the camera +pose and 3D geometry by assuming a Lambertian world. However, the assumption of +photometric consistency is often violated since the non-diffuse reflection is +common in real-world environments. The photometric inconsistency significantly +affects the reliability of existing PBA methods. To solve this problem, we +propose a novel physically-based PBA method. Specifically, we introduce the +physically-based weights regarding material, illumination, and light path. +These weights distinguish the pixel pairs with different levels of photometric +inconsistency. We also design corresponding models for material estimation +based on sequential images and illumination estimation based on point clouds. +In addition, we establish the first SLAM-related dataset of non-Lambertian +scenes with complete ground truth of illumination and material. Extensive +experiments demonstrated that our PBA method outperforms existing approaches in +accuracy. + +
+
+ comment: Accepted to 2024 IEEE/RSJ International Conference on Intelligent + Robots and Systems (IROS 2024) +
+
+
+
+
+ + ☆ NT-ViT: Neural Transcoding Vision Transformers for EEG-to-fMRI Synthesis ECCV24 + + +
+ This paper introduces the Neural Transcoding Vision Transformer (NT-ViT),
+a generative model designed to estimate high-resolution functional Magnetic
+Resonance Imaging (fMRI) samples from simultaneous Electroencephalography (EEG)
+data. A key feature of NT-ViT is its Domain Matching (DM) sub-module, which
+effectively aligns the latent EEG representations with those of fMRI volumes,
+enhancing the model's accuracy and reliability. Unlike previous methods that
+tend to struggle with fidelity and reproducibility of images, NT-ViT
+addresses these challenges by ensuring methodological integrity and
+higher-quality reconstructions, which we showcase through extensive evaluation
+on two benchmark datasets; NT-ViT outperforms the current state-of-the-art
+by a significant margin in both cases, e.g. achieving a $10\times$ reduction in
+RMSE and a $3.14\times$ increase in SSIM on the Oddball dataset. An ablation
+study also provides insights into the contribution of each component to the
+model's overall effectiveness. This development is critical in offering a new
+approach to lessen the time and financial constraints typically linked with
+high-resolution brain imaging, thereby aiding in the swift and precise
+diagnosis of neurological disorders. Although it is not a replacement for
+actual fMRI but rather a step towards making such imaging more accessible, we
+believe that it represents a pivotal advancement in clinical practice and
+neuroscience research. Code is available at
+\url{https://github.com/rom42pla/ntvit}.
+
+
+ comment: ECCV24 Workshop on Synthetic Data for Computer Vision +
+
+
+
+
+ + ☆ RaggeDi: Diffusion-based State Estimation of Disordered Rags, Sheets, + Towels and Blankets + + +
+ Cloth state estimation is an important problem in robotics. It is essential +for the robot to know the accurate state to manipulate cloth and execute tasks +such as robotic dressing, stitching, and covering/uncovering human beings. +However, estimating cloth state accurately remains challenging due to its high +flexibility and self-occlusion. This paper proposes a diffusion model-based +pipeline that formulates the cloth state estimation as an image generation +problem by representing the cloth state as an RGB image that describes the +point-wise translation (translation map) between a pre-defined flattened mesh +and the deformed mesh in a canonical space. Then we train a conditional +diffusion-based image generation model to predict the translation map based on +an observation. Experiments are conducted in both simulation and the real world +to validate the performance of our method. Results indicate that our method +outperforms two recent methods in both accuracy and speed. + +
+
+
+
+
+ + ☆ End-to-End Probabilistic Geometry-Guided Regression for 6DoF Object Pose + Estimation + + +
+ 6D object pose estimation is the problem of identifying the position and +orientation of an object relative to a chosen coordinate system, which is a +core technology for modern XR applications. State-of-the-art 6D object pose +estimators directly predict an object pose given an object observation. Due to +the ill-posed nature of the pose estimation problem, where multiple different +poses can correspond to a single observation, generating additional plausible +estimates per observation can be valuable. To address this, we reformulate the +state-of-the-art algorithm GDRNPP and introduce EPRO-GDR (End-to-End +Probabilistic Geometry-Guided Regression). Instead of predicting a single pose +per detection, we estimate a probability density distribution of the pose. +Using the evaluation procedure defined by the BOP (Benchmark for 6D Object Pose +Estimation) Challenge, we test our approach on four of its core datasets and +demonstrate superior quantitative results for EPRO-GDR on LM-O, YCB-V, and +ITODD. Our probabilistic solution shows that predicting a pose distribution +instead of a single pose can improve state-of-the-art single-view pose +estimation while providing the additional benefit of being able to sample +multiple meaningful pose candidates. + +
+
+
+
+
+ + ☆ EFCM: Efficient Fine-tuning on Compressed Models for deployment of large + models in medical image analysis + + +
+ The recent development of deep learning large models in medicine shows
+remarkable performance in medical image analysis and diagnosis, but their large
+number of parameters causes memory and inference latency challenges. Knowledge
+distillation offers a solution, but with high-resolution pathological images
+and only slide-level labels, slide-level gradients cannot be backpropagated
+to update the student model. This study presents an Efficient Fine-tuning on
+Compressed Models (EFCM) framework with two stages: unsupervised feature
+distillation and fine-tuning. In the distillation stage, Feature Projection
+Distillation (FPD) is proposed with a TransScan module for adaptive receptive
+field adjustment to enhance the knowledge absorption capability of the student
+model. In the slide-level fine-tuning stage, three strategies (Reuse CLAM,
+Retrain CLAM, and End2end Train CLAM (ETC)) are compared. Experiments are
+conducted on 11 downstream datasets related to three large medical models:
+RETFound for retina, MRM for chest X-ray, and BROW for histopathology. The
+experimental results demonstrate that the EFCM framework significantly improves
+accuracy and efficiency in handling slide-level pathological image problems,
+effectively addressing the challenges of deploying large medical models.
+Specifically, it achieves a 4.33% increase in ACC and a 5.2% increase in AUC
+compared to the large model BROW on the TCGA-NSCLC and TCGA-BRCA datasets. The
+analysis of model inference efficiency highlights the high efficiency of the
+distillation fine-tuning method.
+
+
+
+
+
+ + ☆ SymFace: Additional Facial Symmetry Loss for Deep Face Recognition WACV 2025 + + +
+ Over the past decade, there has been a steady advancement in enhancing face
+recognition algorithms leveraging advanced machine learning methods. The loss
+function plays a pivotal, game-changing role in addressing the face
+verification problem. Existing loss functions have mainly explored variations
+in intra-class or inter-class separation. This research examines
+the natural phenomenon of facial symmetry in the face verification problem. The
+symmetry between the left and right hemi-faces has been widely used in many
+research areas in recent decades. This paper adopts this simple approach
+judiciously by splitting the face image vertically into two halves. Assuming
+that the natural phenomenon of facial symmetry can enhance face verification,
+we hypothesize that the two output embedding vectors of the split faces must
+project close to each other in the output embedding space.
+Inspired by this concept, we penalize the network based on the disparity of
+embeddings of the symmetrical pair of split faces. The symmetry loss has the
+potential to minimize minor asymmetric features due to facial expression and
+lighting conditions, hence significantly increasing the inter-class variance
+among the classes and leading to more reliable face embeddings. This loss
+function propels any network past its baseline performance across all
+existing network architectures and configurations, enabling us to achieve SoTA
+results.
+
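+A minimal PyTorch sketch of the symmetry idea, under the assumption that the
+backbone accepts half-width crops; the names and the cosine form are ours, not
+necessarily the paper's exact loss:
+```python
+import torch
+import torch.nn.functional as F
+
+def symmetry_loss(backbone, faces: torch.Tensor) -> torch.Tensor:
+    """faces: (B, C, H, W). Split vertically, mirror the right half, and
+    penalize the cosine disparity between the two half-face embeddings."""
+    w = faces.shape[-1]
+    left = faces[..., : w // 2]
+    right = torch.flip(faces[..., w // 2 :], dims=[-1])  # mirror right half
+    z_l = F.normalize(backbone(left), dim=-1)
+    z_r = F.normalize(backbone(right), dim=-1)
+    return (1.0 - (z_l * z_r).sum(dim=-1)).mean()
+
+backbone = torch.nn.Sequential(  # toy embedding network for illustration
+    torch.nn.Conv2d(3, 16, 3, stride=2, padding=1), torch.nn.ReLU(),
+    torch.nn.AdaptiveAvgPool2d(1), torch.nn.Flatten(), torch.nn.Linear(16, 64))
+loss = symmetry_loss(backbone, torch.rand(8, 3, 112, 112))
+```
+In practice such a term would be added to a standard verification loss
+(e.g., a margin-based softmax) with a weighting coefficient.
+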
+
+ comment: 11 Pages, 6 Figures, 5 Tables, Submitted for WACV 2025 +
+
+
+
+
+ + ☆ EventAug: Multifaceted Spatio-Temporal Data Augmentation Methods for + Event-based Learning + + +
+ The event camera has demonstrated significant success across a wide range of
+areas due to its low latency and high dynamic range. However, the
+community faces challenges such as data deficiency and limited diversity, often
+resulting in over-fitting and inadequate feature learning. Notably, the
+exploration of data augmentation techniques in the event community remains
+scarce. This work aims to address this gap by introducing a systematic
+augmentation scheme named EventAug to enrich spatial-temporal diversity. In
+particular, we first propose Multi-scale Temporal Integration (MSTI) to
+diversify the motion speed of objects, then introduce Spatial-salient Event
+Mask (SSEM) and Temporal-salient Event Mask (TSEM) to enrich object variants.
+EventAug can thus help models learn richer motion patterns, object variants
+and local spatio-temporal relations, improving model robustness
+to varied moving speeds, occlusions, and action disruptions. Experiment results
+show that our augmentation method consistently yields significant improvements
+across different tasks and backbones (e.g., a 4.87% accuracy gain on DVS128
+Gesture). Our code will be publicly available for this community.
+
+
+
+
+
+ + ☆ Latent fingerprint enhancement for accurate minutiae detection + + +
+ Identification of suspects based on partial and smudged fingerprints,
+commonly referred to as fingermarks or latent fingerprints, presents a
+significant challenge in the field of fingerprint recognition. Although
+fixed-length embeddings have shown effectiveness in recognising rolled and slap
+fingerprints, the methods for matching latent fingerprints have primarily
+centred around local minutiae-based embeddings, failing to fully exploit global
+representations for matching purposes. Consequently, enhancing latent
+fingerprints becomes critical to ensuring robust identification for forensic
+investigations. Current approaches often prioritise restoring ridge patterns,
+overlooking the fine details crucial for accurate fingerprint
+recognition. To address this, we propose a novel approach that uses generative
+adversarial networks (GANs) to redefine Latent Fingerprint Enhancement (LFE)
+through a structured approach to fingerprint generation. By directly optimising
+the minutiae information during the generation process, the model produces
+enhanced latent fingerprints that exhibit exceptional fidelity to ground-truth
+instances. This leads to a significant improvement in identification
+performance. Our framework integrates minutiae locations and orientation
+fields, ensuring the preservation of both local and structural fingerprint
+features. Extensive evaluations conducted on two publicly available datasets
+demonstrate our method's superiority over existing state-of-the-art techniques,
+highlighting its potential to significantly enhance latent fingerprint
+recognition accuracy in forensic applications.
+
+
+
+
+
+ + ☆ Efficient Low-Resolution Face Recognition via Bridge Distillation + + +
+ Face recognition in the wild is now advancing towards light-weight models, +fast inference speed and resolution-adapted capability. In this paper, we +propose a bridge distillation approach to turn a complex face model pretrained +on private high-resolution faces into a light-weight one for low-resolution +face recognition. In our approach, such a cross-dataset resolution-adapted +knowledge transfer problem is solved via two-step distillation. In the first +step, we conduct cross-dataset distillation to transfer the prior knowledge +from private high-resolution faces to public high-resolution faces and generate +compact and discriminative features. In the second step, the resolution-adapted +distillation is conducted to further transfer the prior knowledge to synthetic +low-resolution faces via multi-task learning. By learning low-resolution face +representations and mimicking the adapted high-resolution knowledge, a +light-weight student model can be constructed with high efficiency and +promising accuracy in recognizing low-resolution faces. Experimental results +show that the student model performs impressively in recognizing low-resolution +faces with only 0.21M parameters and 0.057MB memory. Meanwhile, its speed +reaches up to 14,705, ~934 and 763 faces per second on GPU, CPU and mobile +phone, respectively. + +
+
+ comment: This paper is published in IEEE TIP 2020 +
+
+
+
+
+ + ☆ Distilling Channels for Efficient Deep Tracking + + +
+ Deep trackers have proven success in visual tracking. Typically, these
+trackers employ optimally pre-trained deep networks to represent all diverse
+objects with multi-channel features from some fixed layers. The deep networks
+employed are usually trained to extract rich knowledge from massive data used
+in object classification, so they are capable of representing generic objects
+very well. However, these networks are too complex to represent a specific
+moving object, leading to poor generalization as well as high computational and
+memory costs. This paper presents a novel and general framework termed channel
+distillation to facilitate deep trackers. To validate the effectiveness of
+channel distillation, we take the discriminative correlation filter (DCF) and
+ECO as examples. We demonstrate that an integrated formulation can turn feature
+compression, response map generation, and model update into a unified energy
+minimization problem to adaptively select informative feature channels that
+improve the efficacy of tracking moving objects on the fly. Channel
+distillation can accurately extract good channels, alleviating the influence of
+noisy channels and generally reducing the number of channels, as well as
+adaptively generalizing to different channels and networks. The resulting deep
+tracker is accurate, fast, and has low memory requirements. Extensive
+experimental evaluations on popular benchmarks clearly demonstrate the
+effectiveness and generalizability of our framework.
+
+
+ comment: Published by IEEE TIP 2020 +
+
+
+
+
+ + ☆ Knowledge Adaptation Network for Few-Shot Class-Incremental Learning + + +
+ Few-shot class-incremental learning (FSCIL) aims to incrementally recognize
+new classes using a few samples while maintaining the performance on previously
+learned classes. One effective way to meet this challenge is to construct
+classifiers via prototype evolution. Despite the advancements achieved by most
+existing methods, classifier weights are simply initialized using mean
+features. We argue such a strategy is suboptimal because representations of
+new classes are weak and biased. In this paper, we tackle this issue from
+two aspects. Firstly, thanks to the development of foundation models, we employ
+one such model, CLIP, as the network's base to provide a general
+representation for each class. Secondly, to generate a more reliable and
+comprehensive instance representation, we propose a Knowledge Adapter (KA)
+module that summarizes the data-specific knowledge from training data and fuses
+it into the general representation. Additionally, to tune the knowledge learned
+from the base classes to the upcoming classes, we propose a mechanism of
+Incremental Pseudo Episode Learning (IPEL) that simulates the actual FSCIL
+setting. Taken together, our proposed method, dubbed Knowledge Adaptation Network
+(KANet), achieves competitive performance on a wide range of datasets,
+including CIFAR100, CUB200, and ImageNet-R.
+
+
+ comment: 13 pages;6 figures +
+
+
+
+
+ + ☆ Neural Encoding for Image Recall: Human-Like Memory + + +
+ Achieving human-like memory recall in artificial systems remains a +challenging frontier in computer vision. Humans demonstrate remarkable ability +to recall images after a single exposure, even after being shown thousands of +images. However, this capacity diminishes significantly when confronted with +non-natural stimuli such as random textures. In this paper, we present a method +inspired by human memory processes to bridge this gap between artificial and +biological memory systems. Our approach focuses on encoding images to mimic the +high-level information retained by the human brain, rather than storing raw +pixel data. By adding noise to images before encoding, we introduce variability +akin to the non-deterministic nature of human memory encoding. Leveraging +pre-trained models' embedding layers, we explore how different architectures +encode images and their impact on memory recall. Our method achieves impressive +results, with 97% accuracy on natural images and near-random performance (52%) +on textures. We provide insights into the encoding process and its implications +for machine learning memory systems, shedding light on the parallels between +human and artificial intelligence memory mechanisms. + +
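+A hypothetical sketch of the noisy-encoding recall test described above, using
+an off-the-shelf embedding layer; the noise level, threshold, and backbone
+choice are assumptions:
+```python
+import torch
+import torchvision.models as models
+
+model = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
+model.fc = torch.nn.Identity()  # keep the penultimate embedding
+model.eval()
+
+@torch.no_grad()
+def encode(img: torch.Tensor, noise_std: float = 0.1) -> torch.Tensor:
+    """Add noise before encoding to mimic non-deterministic memory encoding."""
+    noisy = img + noise_std * torch.randn_like(img)
+    return torch.nn.functional.normalize(model(noisy), dim=-1)
+
+memory = encode(torch.rand(8, 3, 224, 224))  # images "seen" once
+probe = encode(torch.rand(1, 3, 224, 224))
+recalled = (memory @ probe.T).max() > 0.9    # nearest-neighbour recall test
+```
+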
+
+ comment: 5 pages, 7 figures +
+
+
+
+
+ + ☆ RockTrack: A 3D Robust Multi-Camera-Ken Multi-Object Tracking Framework + + +
+ 3D Multi-Object Tracking (MOT) obtains significant performance improvements +with the rapid advancements in 3D object detection, particularly in +cost-effective multi-camera setups. However, the prevalent end-to-end training +approach for multi-camera trackers results in detector-specific models, +limiting their versatility. Moreover, current generic trackers overlook the +unique features of multi-camera detectors, i.e., the unreliability of motion +observations and the feasibility of visual information. To address these +challenges, we propose RockTrack, a 3D MOT method for multi-camera detectors. +Following the Tracking-By-Detection framework, RockTrack is compatible with +various off-the-shelf detectors. RockTrack incorporates a confidence-guided +preprocessing module to extract reliable motion and image observations from +distinct representation spaces from a single detector. These observations are +then fused in an association module that leverages geometric and appearance +cues to minimize mismatches. The resulting matches are propagated through a +staged estimation process, forming the basis for heuristic noise modeling. +Additionally, we introduce a novel appearance similarity metric for explicitly +characterizing object affinities in multi-camera settings. RockTrack achieves +state-of-the-art performance on the nuScenes vision-only tracking leaderboard +with 59.1% AMOTA while demonstrating impressive computational efficiency. + +
+
+ comment: RockTrack establishes a new state-of-the-art with 59.1% AMOTA on the + nuScenes vision-only test leaderboard with ResNet50-level backbone +
+
+
+
+
+ + ☆ Exploring Gaze Pattern in Autistic Children: Clustering, Visualization, + and Prediction + + +
+ Autism Spectrum Disorder (ASD) significantly affects the social and
+communication abilities of children, and eye-tracking is commonly used as a
+diagnostic tool by identifying associated atypical gaze patterns. Traditional
+methods demand manual identification of Areas of Interest in gaze patterns,
+lowering the performance of gaze behavior analysis in ASD subjects. To tackle
+this limitation, we propose a novel method to automatically analyze gaze
+behaviors in ASD children with superior accuracy. Specifically, we first
+apply and optimize seven clustering algorithms to automatically group gaze
+points, comparing ASD subjects with typically developing peers. Subsequently,
+we extract 63 significant features to fully describe the patterns. These
+features can describe correlations between ASD diagnosis and gaze patterns.
+Lastly, using these features as prior knowledge, we train multiple predictive
+machine learning models to predict and diagnose ASD based on gaze
+behaviors. To evaluate our method, we apply it to three ASD datasets.
+The experimental and visualization results demonstrate the improvements of
+clustering algorithms in the analysis of unique gaze patterns in ASD children.
+Additionally, the predictive machine learning models achieve state-of-the-art
+performance ($81\%$ AUC) for ASD diagnosis from automatically constructed
+gaze-point features. Our code is available at
+\url{https://github.com/username/projectname}.
+
+
+
+
+
+ + ☆ InverseMeetInsert: Robust Real Image Editing via Geometric Accumulation + Inversion in Guided Diffusion Models + + +
+ In this paper, we introduce Geometry-Inverse-Meet-Pixel-Insert, short for +GEO, an exceptionally versatile image editing technique designed to cater to +customized user requirements at both local and global scales. Our approach +seamlessly integrates text prompts and image prompts to yield diverse and +precise editing outcomes. Notably, our method operates without the need for +training and is driven by two key contributions: (i) a novel geometric +accumulation loss that enhances DDIM inversion to faithfully preserve pixel +space geometry and layout, and (ii) an innovative boosted image prompt +technique that combines pixel-level editing for text-only inversion with latent +space geometry guidance for standard classifier-free reversion. Leveraging the +publicly available Stable Diffusion model, our approach undergoes extensive +evaluation across various image types and challenging prompt editing scenarios, +consistently delivering high-fidelity editing results for real images. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ DETECLAP: Enhancing Audio-Visual Representation Learning with Object + Information + + +
+ Current audio-visual representation learning can capture rough object
+categories (e.g., ``animals'' and ``instruments''), but it lacks the ability to
+recognize fine-grained details, such as specific categories like ``dogs'' and
+``flutes'' within animals and instruments. To address this issue, we introduce
+DETECLAP, a method to enhance audio-visual representation learning with object
+information. Our key idea is to introduce an audio-visual label prediction loss
+to the existing Contrastive Audio-Visual Masked AutoEncoder to enhance its
+object awareness. To avoid costly manual annotations, we prepare object labels
+from both audio and visual inputs using state-of-the-art language-audio models
+and object detectors. We evaluate the method on audio-visual retrieval and
+classification using the VGGSound and AudioSet20K datasets. Our method achieves
+improvements in recall@10 of +1.5% and +1.2% for audio-to-visual and
+visual-to-audio retrieval, respectively, and an improvement in accuracy of
++0.6% for audio-visual classification.
+
+
+ comment: under review +
+
+
+
+
+ + ☆ LFIC-DRASC: Deep Light Field Image Compression Using Disentangled + Representation and Asymmetrical Strip Convolution + + +
+ A Light-Field (LF) image is an emerging form of 4D light-ray data capable of
+realistically presenting the spatial and angular information of a 3D scene.
+However, the large data volume of LF images becomes the most challenging issue
+in real-time processing, transmission, and storage. In this paper, we propose an
+end-to-end deep LF Image Compression method Using Disentangled Representation
+and Asymmetrical Strip Convolution (LFIC-DRASC) to improve coding efficiency.
+Firstly, we formulate the LF image compression problem as learning a
+disentangled LF representation network and an image encoding-decoding network.
+Secondly, we propose two novel feature extractors that leverage the structural
+prior of LF data by integrating features across different dimensions.
+Meanwhile, a disentangled LF representation network is proposed to enhance LF
+feature disentangling and decoupling. Thirdly, we propose the LFIC-DRASC for LF
+image compression, where two Asymmetrical Strip Convolution (ASC) operators,
+i.e., horizontal and vertical, are proposed to capture long-range correlation
+in the LF feature space. These two ASC operators can be combined with the square
+convolution to further decouple LF features, which enhances the model's ability
+to represent intricate spatial relationships. Experimental results
+demonstrate that the proposed LFIC-DRASC achieves an average of 20.5\% bit rate
+reduction compared with the state-of-the-art methods.
+
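+The strip-convolution idea can be illustrated with a small PyTorch module;
+the kernel sizes and the way the three branches are combined here are
+assumptions, not the paper's exact design:
+```python
+import torch
+import torch.nn as nn
+
+class ASCBlock(nn.Module):
+    """Horizontal and vertical strip convolutions plus a square convolution."""
+    def __init__(self, ch: int, k: int = 7):
+        super().__init__()
+        self.h = nn.Conv2d(ch, ch, (1, k), padding=(0, k // 2))  # horizontal strip
+        self.v = nn.Conv2d(ch, ch, (k, 1), padding=(k // 2, 0))  # vertical strip
+        self.sq = nn.Conv2d(ch, ch, 3, padding=1)                # square conv
+
+    def forward(self, x):
+        return self.h(x) + self.v(x) + self.sq(x)
+
+y = ASCBlock(32)(torch.randn(1, 32, 64, 64))
+print(y.shape)  # torch.Size([1, 32, 64, 64])
+```
+The elongated kernels capture long-range correlation along one LF dimension at
+a time, which is what motivates pairing them with the square convolution.
+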
+
+
+
+
+ + ☆ RopeBEV: A Multi-Camera Roadside Perception Network in Bird's-Eye-View + + +
+ Multi-camera perception methods in Bird's-Eye-View (BEV) have gained wide
+application in autonomous driving. However, due to the differences between
+roadside and vehicle-side scenarios, a multi-camera BEV solution for roadside
+scenarios has been lacking. This paper systematically analyzes the key
+challenges in multi-camera BEV perception for roadside scenarios compared to
+vehicle-side ones. These challenges include the diversity in camera poses, the
+uncertainty in camera numbers, the sparsity in perception regions, and the
+ambiguity in orientation angles. In response, we introduce RopeBEV, the first
+dense multi-camera BEV approach for roadside perception. RopeBEV introduces BEV
+augmentation to address the training balance issues caused by diverse camera
+poses. By incorporating CamMask and ROIMask (Region of Interest Mask), it
+supports variable camera numbers and sparse perception, respectively. Finally,
+camera rotation embedding is utilized to resolve orientation ambiguity. Our
+method ranks 1st on the real-world highway dataset RoScenes and demonstrates
+its practical value on a private urban dataset that covers more than 50
+intersections and 600 cameras.
+
+
+
+
+
+ + ☆ Discovering Conceptual Knowledge with Analytic Ontology Templates for + Articulated Objects + + +
+ Human cognition can leverage fundamental conceptual knowledge, such as
+geometric and kinematic knowledge, to appropriately perceive, comprehend and
+interact with novel objects. Motivated by this finding, we aim to endow machine
+intelligence with an analogous capability by operating at the conceptual level,
+in order to understand and then interact with articulated objects, especially
+those in novel categories, which is challenging due to the intricate geometric
+structures and diverse joint types of articulated objects. To achieve this
+goal, we propose the Analytic Ontology Template (AOT), a parameterized and
+differentiable program description of generalized conceptual ontologies. A
+baseline approach called AOTNet driven by AOTs is designed accordingly to equip
+intelligent agents with these generalized concepts, and then empower the agents
+to effectively discover the conceptual knowledge on the structure and
+affordance of articulated objects. The AOT-driven approach yields benefits from
+three key perspectives: i) enabling concept-level understanding of articulated
+objects without relying on any real training data, ii) providing analytic
+structure information, and iii) introducing rich affordance information
+indicating proper ways of interaction. We conduct exhaustive experiments and
+the results demonstrate the superiority of our approach in understanding and
+then interacting with articulated objects.
+
+
+
+
+
+ + ☆ ORB-SfMLearner: ORB-Guided Self-supervised Visual Odometry with + Selective Online Adaptation + + +
+ Deep visual odometry, despite extensive research, still faces limitations in
+accuracy and generalizability that prevent its broader application. To address
+these challenges, we propose an Oriented FAST and Rotated BRIEF (ORB)-guided
+visual odometry with selective online adaptation named ORB-SfMLearner. We
+present a novel use of ORB features for learning-based ego-motion estimation,
+leading to more robust and accurate results. We also introduce a
+cross-attention mechanism to enhance the explainability of PoseNet and reveal
+that the driving direction of the vehicle can be explained through
+attention weights, marking a novel exploration in this area. To improve
+generalizability, our selective online adaptation allows the network to rapidly
+and selectively adjust to the optimal parameters across different domains.
+Experimental results on the KITTI and vKITTI datasets show that our method
+outperforms previous state-of-the-art deep visual odometry methods in terms of
+ego-motion accuracy and generalizability.
+
+
+
+
+
+ + ☆ GUNet: A Graph Convolutional Network United Diffusion Model for Stable + and Diversity Pose Generation + + +
+ Pose skeleton images are an important reference in pose-controllable image
+generation. In order to enrich the source of skeleton images, recent works have
+investigated the generation of pose skeletons based on natural language. These
+methods are based on GANs. However, it remains challenging to perform diverse,
+structurally correct and aesthetically pleasing human pose skeleton generation
+with various textual inputs. To address this problem, we propose PoseDiffusion,
+a framework with GUNet as its main model. It is the first generative framework
+based on a diffusion model for this task and also contains a series of variants
+fine-tuned based on a Stable Diffusion model. PoseDiffusion demonstrates
+several desired properties that outperform existing methods. 1) Correct
+Skeletons. GUNet, the denoising model of PoseDiffusion, is designed to
+incorporate graph convolutional neural networks. It is able to learn the
+spatial relationships of the human skeleton by introducing skeletal information
+during the training process. 2) Diversity. We decouple the key points of the
+skeleton and characterise them separately, and use cross-attention to introduce
+textual conditions. Experimental results show that PoseDiffusion outperforms
+existing SoTA algorithms in terms of stability and diversity of text-driven
+pose skeleton generation. Qualitative analyses further demonstrate its
+superiority for controllable generation in Stable Diffusion.
+
+
+
+
+
+ + ☆ SLAM assisted 3D tracking system for laparoscopic surgery + + +
+ A major limitation of minimally invasive surgery is the difficulty in +accurately locating the internal anatomical structures of the target organ due +to the lack of tactile feedback and transparency. Augmented reality (AR) offers +a promising solution to overcome this challenge. Numerous studies have shown +that combining learning-based and geometric methods can achieve accurate +preoperative and intraoperative data registration. This work proposes a +real-time monocular 3D tracking algorithm for post-registration tasks. The +ORB-SLAM2 framework is adopted and modified for prior-based 3D tracking. The +primitive 3D shape is used for fast initialization of the monocular SLAM. A +pseudo-segmentation strategy is employed to separate the target organ from the +background for tracking purposes, and the geometric prior of the 3D shape is +incorporated as an additional constraint in the pose graph. Experiments from +in-vivo and ex-vivo tests demonstrate that the proposed 3D tracking system +provides robust 3D tracking and effectively handles typical challenges such as +fast motion, out-of-field-of-view scenarios, partial visibility, and +"organ-background" relative motion. + +
+
+ comment: Demo: https://youtu.be/B1xZW8bj3cM +
+
+
+
+
+ + ☆ Detecting Underdiagnosed Medical Conditions with Deep Learning-Based + Opportunistic CT Imaging + + +
+ Abdominal computed tomography (CT) scans are frequently performed in clinical +settings. Opportunistic CT involves repurposing routine CT images to extract +diagnostic information and is an emerging tool for detecting underdiagnosed +conditions such as sarcopenia, hepatic steatosis, and ascites. This study +utilizes deep learning methods to promote accurate diagnosis and clinical +documentation. We analyze 2,674 inpatient CT scans to identify discrepancies +between imaging phenotypes (characteristics derived from opportunistic CT +scans) and their corresponding documentation in radiology reports and ICD +coding. Through our analysis, we find that only 0.5%, 3.2%, and 30.7% of scans +diagnosed with sarcopenia, hepatic steatosis, and ascites (respectively) +through either opportunistic imaging or radiology reports were ICD-coded. Our +findings demonstrate opportunistic CT's potential to enhance diagnostic +precision and accuracy of risk adjustment models, offering advancements in +precision medicine. + +
+
+
+
+
+ + ☆ SRIF: Semantic Shape Registration Empowered by Diffusion-based Image + Morphing and Flow Estimation + + +
+ In this paper, we propose SRIF, a novel Semantic shape Registration framework
+based on diffusion-based Image morphing and Flow estimation. More concretely,
+given a pair of extrinsically aligned shapes, we first render them from
+multiple views, and then utilize an image interpolation framework based on
+diffusion models to generate sequences of intermediate images between them. The
+images are later fed into a dynamic 3D Gaussian splatting framework, with which
+we reconstruct and post-process intermediate point clouds that respect the
+image morphing process. Finally, tailored to the above, we propose a
+novel registration module to estimate continuous normalizing flow, which
+deforms the source shape consistently towards the target, with intermediate
+point clouds as weak guidance. Our key insight is to leverage large vision
+models (LVMs) to associate shapes, thereby obtaining much richer semantic
+information on the relationship between shapes than ad-hoc feature
+extraction and alignment. As a consequence, SRIF not only achieves high-quality
+dense correspondences on challenging shape pairs, but also delivers smooth,
+semantically meaningful interpolation in between. Empirical evidence justifies
+the effectiveness and superiority of our method as well as specific design
+choices. The code is released at https://github.com/rqhuang88/SRIF.
+
+
+
+
+
+ + ☆ Gradient-Driven 3D Segmentation and Affordance Transfer in Gaussian + Splatting Using 2D Masks ICRA 2025 + + +
+ 3D Gaussian Splatting has emerged as a powerful 3D scene representation +technique, capturing fine details with high efficiency. In this paper, we +introduce a novel voting-based method that extends 2D segmentation models to 3D +Gaussian splats. Our approach leverages masked gradients, where gradients are +filtered by input 2D masks, and these gradients are used as votes to achieve +accurate segmentation. As a byproduct, we discovered that inference-time +gradients can also be used to prune Gaussians, resulting in up to 21% +compression. Additionally, we explore few-shot affordance transfer, allowing +annotations from 2D images to be effectively transferred onto 3D Gaussian +splats. The robust yet straightforward mathematical formulation underlying this +approach makes it a highly effective tool for numerous downstream applications, +such as augmented reality (AR), object editing, and robotics. The project code +and additional resources are available at +https://jojijoseph.github.io/3dgs-segmentation. + +
+
+ comment: Preprint, Under review for ICRA 2025 +
+
+
+
+
+ + ☆ Agent Aggregator with Mask Denoise Mechanism for Histopathology Whole + Slide Image Analysis + + +
+ Histopathology analysis is the gold standard for medical diagnosis. Accurate
+classification of whole slide images (WSIs) and localization of regions of
+interest (ROIs) can assist pathologists in diagnosis. The gigapixel resolution
+of WSIs and the absence of fine-grained annotations make direct classification
+and analysis challenging. In weakly supervised learning, multiple instance
+learning (MIL) presents a promising approach for WSI classification. The
+prevailing strategy is to use attention mechanisms to measure instance
+importance for classification. However, attention mechanisms fail to capture
+inter-instance information, and self-attention causes quadratic computational
+complexity. To address these challenges, we propose AMD-MIL, an agent
+aggregator with a mask denoise mechanism. The agent token acts as an
+intermediate variable between the query and key for computing instance
+importance. Mask and denoising matrices, mapped from the agent-aggregated
+values, dynamically mask low-contribution representations and eliminate noise.
+AMD-MIL achieves better attention allocation by adjusting feature
+representations, capturing micro-metastases in cancer, and improving
+interpretability. Extensive experiments on CAMELYON-16, CAMELYON-17,
+TCGA-KIDNEY, and TCGA-LUNG show AMD-MIL's superiority over state-of-the-art
+methods.
+
+
+
+
+
+ + ☆ Bridging Domain Gap for Flight-Ready Spaceborne Vision + + +
+ This work presents Spacecraft Pose Network v3 (SPNv3), a Neural Network (NN) +for monocular pose estimation of a known, non-cooperative target spacecraft. As +opposed to existing literature, SPNv3 is designed and trained to be +computationally efficient while providing robustness to spaceborne images that +have not been observed during offline training and validation on the ground. +These characteristics are essential to deploying NNs on space-grade edge +devices. They are achieved through careful NN design choices, and an extensive +trade-off analysis reveals features such as data augmentation, transfer +learning and vision transformer architecture as a few of those that contribute +to simultaneously maximizing robustness and minimizing computational overhead. +Experiments demonstrate that the final SPNv3 can achieve state-of-the-art pose +accuracy on hardware-in-the-loop images from a robotic testbed while having +trained exclusively on computer-generated synthetic images, effectively +bridging the domain gap between synthetic and real imagery. At the same time, +SPNv3 runs well above the update frequency of modern satellite navigation +filters when tested on a representative graphical processing unit system with +flight heritage. Overall, SPNv3 is an efficient, flight-ready NN model readily +applicable to a wide range of close-range rendezvous and proximity operations +with target resident space objects. The code implementation of SPNv3 will be +made publicly available. + +
+
+ comment: Submitted to Journal of Spacecraft and Rockets; Appeared as Chapter 4 + of Tae Ha Park's PhD thesis +
+
+
+
+
+ + ☆ VL-Reader: Vision and Language Reconstructor is an Effective Scene Text + Recognizer + + +
+ Text recognition is an inherent integration of vision and language,
+encompassing the visual texture in stroke patterns and the semantic context
+among the character sequences. Towards advanced text recognition, there are
+three key challenges: (1) an encoder capable of representing the visual and
+semantic distributions; (2) a decoder that ensures the alignment between vision
+and semantics; and (3) consistency of the framework between pre-training, if it
+exists, and fine-tuning. Inspired by masked autoencoding, a successful
+pre-training strategy in both vision and language, we propose an innovative
+scene text recognition approach, named VL-Reader. The novelty of the VL-Reader
+lies in the pervasive interplay between vision and language throughout the
+entire process. Concretely, we first introduce a Masked Visual-Linguistic
+Reconstruction (MVLR) objective, which aims at simultaneously modeling visual
+and linguistic information. Then, we design a Masked Visual-Linguistic Decoder
+(MVLD) to further leverage masked vision-language context and achieve bi-modal
+feature interaction. The architecture of VL-Reader maintains consistency from
+pre-training to fine-tuning. In the pre-training stage, VL-Reader reconstructs
+both masked visual and text tokens, while in the fine-tuning stage, the network
+degrades to reconstruct all characters from an image without any masked
+regions. VL-Reader achieves an average accuracy of 97.1% on six typical
+datasets, surpassing the SOTA by 1.1%. The improvement is even more
+significant on challenging datasets. The results demonstrate that a vision and
+language reconstructor can serve as an effective scene text recognizer.
+
+
+ comment: Accepted by ACM-MM2024 +
+
+
+
+
+ + ☆ Enhancing Semi-Supervised Learning via Representative and Diverse Sample + Selection + + +
+ Semi-Supervised Learning (SSL) has become a preferred paradigm in many deep +learning tasks, which reduces the need for human labor. Previous studies +primarily focus on effectively utilising the labelled and unlabeled data to +improve performance. However, we observe that how to select samples for +labelling also significantly impacts performance, particularly under extremely +low-budget settings. The sample selection task in SSL has been under-explored +for a long time. To fill in this gap, we propose a Representative and Diverse +Sample Selection approach (RDSS). By adopting a modified Frank-Wolfe algorithm +to minimise a novel criterion $\alpha$-Maximum Mean Discrepancy ($\alpha$-MMD), +RDSS samples a representative and diverse subset for annotation from the +unlabeled data. We demonstrate that minimizing $\alpha$-MMD enhances the +generalization ability of low-budget learning. Experimental results show that +RDSS consistently improves the performance of several popular SSL frameworks +and outperforms the state-of-the-art sample selection approaches used in Active +Learning (AL) and Semi-Supervised Active Learning (SSAL), even with constrained +annotation budgets. + +
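+To illustrate the MMD-based selection objective, here is a simplified greedy
+sketch; the paper uses a modified Frank-Wolfe algorithm on an $\alpha$-MMD
+criterion, so this plain-MMD greedy version is our assumption for
+illustration only:
+```python
+import numpy as np
+
+def rbf(x, y, gamma=1.0):
+    """RBF kernel matrix between two feature sets."""
+    d = ((x[:, None, :] - y[None, :, :]) ** 2).sum(-1)
+    return np.exp(-gamma * d)
+
+def greedy_mmd_select(feats: np.ndarray, budget: int) -> list:
+    """Greedily pick a subset whose kernel mean matches the unlabeled pool,
+    i.e. approximately minimizing MMD^2(subset, pool)."""
+    pool_mean = rbf(feats, feats).mean(axis=1)  # mean kernel to the pool
+    chosen = []
+    for _ in range(budget):
+        best, best_gain = -1, -np.inf
+        for i in range(len(feats)):
+            if i in chosen:
+                continue
+            cand = chosen + [i]
+            k_ss = rbf(feats[cand], feats[cand]).mean()
+            gain = 2 * pool_mean[cand].mean() - k_ss  # = -MMD^2 up to a constant
+            if gain > best_gain:
+                best, best_gain = i, gain
+        chosen.append(best)
+    return chosen
+
+subset = greedy_mmd_select(np.random.rand(100, 16), budget=10)
+```
+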
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Relax DARTS: Relaxing the Constraints of Differentiable Architecture + Search for Eye Movement Recognition + + +
+ Eye movement biometrics is a secure and innovative identification method.
+Deep learning methods have shown good performance, but their network
+architectures rely on manual design and combined prior knowledge. To address
+these issues, we introduce neural architecture search (NAS) algorithms to the
+field of eye movement recognition and present Relax DARTS, an improvement of
+Differentiable Architecture Search (DARTS) that realizes more efficient
+network search and training. The key idea is to circumvent the issue
+of weight sharing by independently training the architecture parameters
+$\alpha$ to achieve a more precise target architecture. Moreover, the
+introduction of module input weights $\beta$ gives cells the flexibility to
+select inputs, alleviating overfitting and improving model performance.
+Results on four public databases demonstrate that Relax DARTS
+achieves state-of-the-art recognition performance. Notably, Relax DARTS
+exhibits adaptability to other multi-feature temporal classification tasks.
+
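+For context, a DARTS-style mixed edge with both architecture parameters
+$\alpha$ and input weights $\beta$ can be sketched as below; this shows only
+the parameterization, not Relax DARTS's independent training of $\alpha$:
+```python
+import torch
+import torch.nn as nn
+
+class MixedEdge(nn.Module):
+    """Candidate ops weighted by softmax(alpha); inputs weighted by softmax(beta)."""
+    def __init__(self, ops, n_inputs: int):
+        super().__init__()
+        self.ops = nn.ModuleList(ops)
+        self.alpha = nn.Parameter(torch.zeros(len(ops)))  # operation choice
+        self.beta = nn.Parameter(torch.zeros(n_inputs))   # flexible input choice
+
+    def forward(self, inputs):
+        x = sum(w * t for w, t in zip(torch.softmax(self.beta, 0), inputs))
+        return sum(w * op(x) for w, op in zip(torch.softmax(self.alpha, 0), self.ops))
+
+edge = MixedEdge([nn.Conv1d(8, 8, 3, padding=1), nn.Identity()], n_inputs=2)
+out = edge([torch.randn(4, 8, 16), torch.randn(4, 8, 16)])
+```
+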
+
+ comment: Accepted By CCBR 2024 +
+
+
+
+
+ + ☆ Few-Shot Learning Approach on Tuberculosis Classification Based on Chest + X-Ray Images + + +
+ Tuberculosis (TB) is caused by the bacterium Mycobacterium tuberculosis, +primarily affecting the lungs. Early detection is crucial for improving +treatment effectiveness and reducing transmission risk. Artificial intelligence +(AI), particularly through image classification of chest X-rays, can assist in +TB detection. However, class imbalance in TB chest X-ray datasets presents a +challenge for accurate classification. In this paper, we propose a few-shot +learning (FSL) approach using the Prototypical Network algorithm to address +this issue. We compare the performance of ResNet-18, ResNet-50, and VGG16 in +feature extraction from the TBX11K Chest X-ray dataset. Experimental results +demonstrate classification accuracies of 98.93% for ResNet-18, 98.60% for +ResNet-50, and 33.33% for VGG16. These findings indicate that the proposed +method outperforms others in mitigating data imbalance, which is particularly +beneficial for disease classification applications. + +
+
+ comment: 6 pages. Pre-print +
+
+
+
+
+ + ☆ DAF-Net: A Dual-Branch Feature Decomposition Fusion Network with Domain + Adaptive for Infrared and Visible Image Fusion + + +
+ Infrared and visible image fusion aims to combine complementary information
+from both modalities to provide a more comprehensive scene understanding.
+However, due to the significant differences between the two modalities,
+preserving key features during the fusion process remains a challenge. To
+address this issue, we propose a dual-branch feature decomposition fusion
+network with domain adaptation (DAF-Net), which introduces Multi-Kernel Maximum
+Mean Discrepancy (MK-MMD) into the base encoder and designs a hybrid kernel
+function suitable for infrared and visible image fusion. The base encoder,
+built on the Restormer network, captures global structural information, while
+the detail encoder, based on Invertible Neural Networks (INN), focuses on
+extracting detail texture information. By incorporating MK-MMD, DAF-Net
+effectively aligns the latent feature spaces of visible and infrared images,
+thereby improving the quality of the fused images. Experimental results
+demonstrate that the proposed method outperforms existing techniques across
+multiple datasets, significantly enhancing both visual quality and fusion
+performance. The related Python code is available at
+https://github.com/xujian000/DAF-Net.
+
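+A compact sketch of a multi-kernel MMD alignment term between the two
+modalities' latents; the plain RBF mixture here is a generic stand-in for the
+paper's hybrid kernel:
+```python
+import torch
+
+def mk_mmd(x: torch.Tensor, y: torch.Tensor, gammas=(0.5, 1.0, 2.0)) -> torch.Tensor:
+    """Biased MK-MMD estimate between two batches of features (n, d)."""
+    def k(a, b):
+        d = torch.cdist(a, b) ** 2
+        return sum(torch.exp(-g * d) for g in gammas) / len(gammas)
+    return k(x, x).mean() - 2 * k(x, y).mean() + k(y, y).mean()
+
+vis_feat = torch.randn(32, 128)  # visible-image latents (stand-ins)
+ir_feat = torch.randn(32, 128)   # infrared latents (stand-ins)
+align_loss = mk_mmd(vis_feat, ir_feat)
+```
+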
+
+ comment: 5pages,4figures +
+
+
+
+
+ + ☆ PainDiffusion: Can robot express pain? + + +
+ Pain is an intuitive and user-friendly way for a patient to communicate
+problems, making it especially useful in rehabilitation nurse training robots.
+While most previous methods have focused on classifying or recognizing pain
+expressions, these approaches often result in unnatural, jiggling robot faces.
+We introduce PainDiffusion, a model that generates facial expressions in
+response to pain stimuli, with controllable pain expressiveness and emotional
+status. PainDiffusion leverages diffusion forcing to roll out predictions over
+arbitrary lengths using a conditioned temporal U-Net. It operates as a latent
+diffusion model within EMOCA's facial expression latent space, ensuring a
+compact data representation and quick rendering time. For training data, we
+process the BioVid Heatpain Database, extracting expression codes and subject
+identity configurations. We also propose a novel set of metrics to evaluate
+pain expressions, focusing on expressiveness, diversity, and the
+appropriateness of model-generated outputs. Finally, we demonstrate that
+PainDiffusion outperforms the autoregressive method, both qualitatively and
+quantitatively. Code, videos, and further analysis are available at:
+\href{https://damtien444.github.io/paindf/}{https://damtien444.github.io/paindf/}.
+
+
+ comment: Under reviewing +
+
+
+
+
+ + ☆ Multimodal Generalized Category Discovery + + +
+ Generalized Category Discovery (GCD) aims to classify inputs into both known +and novel categories, a task crucial for open-world scientific discoveries. +However, current GCD methods are limited to unimodal data, overlooking the +inherently multimodal nature of most real-world data. In this work, we extend +GCD to a multimodal setting, where inputs from different modalities provide +richer and complementary information. Through theoretical analysis and +empirical validation, we identify that the key challenge in multimodal GCD lies +in effectively aligning heterogeneous information across modalities. To address +this, we propose MM-GCD, a novel framework that aligns both the feature and +output spaces of different modalities using contrastive learning and +distillation techniques. MM-GCD achieves new state-of-the-art performance on +the UPMC-Food101 and N24News datasets, surpassing previous methods by 11.5\% +and 4.7\%, respectively. + +
+
+
+
+
+ + ☆ Hyperspectral Image Classification Based on Faster Residual Multi-branch + Spiking Neural Network + + +
+ Convolutional neural networks (CNNs) perform well in hyperspectral image
+(HSI) classification tasks, but their high energy consumption and complex
+network structure make them difficult to apply directly on edge computing
+devices. At present, spiking neural networks (SNNs) are developing rapidly in
+HSI classification tasks due to their low energy consumption and event-driven
+characteristics. However, they usually require more time steps to achieve
+optimal accuracy. In response to the above problems, this paper builds a
+spiking neural network (SNN-SWMR) based on the leaky integrate-and-fire (LIF)
+neuron model for HSI classification tasks. The network uses the spiking width
+mixed residual (SWMR) module as the basic unit to perform feature extraction
+operations. The spiking width mixed residual module is composed of spiking
+mixed convolution (SMC), which can effectively extract spatial-spectral
+features. Secondly, this paper designs a simple and efficient arcsine
+approximate derivative (AAD), which solves the non-differentiability of spike
+firing by approximating the Dirac function. Through AAD, we can directly train
+supervised spiking neural networks. Finally, this paper conducts comparative
+experiments with multiple advanced SNN-based HSI classification algorithms on
+six public hyperspectral datasets. Experimental results show that the AAD
+function has strong robustness and a good fitting effect. Meanwhile, compared
+with other algorithms, SNN-SWMR reduces the number of time steps by about 84%
+and the training and testing time by about 63% and 70%, respectively, at the
+same accuracy. This study solves a key problem of SNN-based HSI classification
+algorithms, which has important practical significance for promoting the
+application of HSI classification algorithms on edge devices such as spaceborne
+and airborne platforms.
+
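+A surrogate-gradient sketch of the AAD idea in PyTorch; the exact arcsine form
+and the sharpness constant are our assumptions:
+```python
+import torch
+
+class SpikeAAD(torch.autograd.Function):
+    """Heaviside spike with an arcsine-style approximate derivative (AAD)."""
+    @staticmethod
+    def forward(ctx, v: torch.Tensor) -> torch.Tensor:
+        ctx.save_for_backward(v)
+        return (v > 0).float()  # non-differentiable spike in the forward pass
+
+    @staticmethod
+    def backward(ctx, grad_out: torch.Tensor) -> torch.Tensor:
+        (v,) = ctx.saved_tensors
+        a = 2.0                              # sharpness hyperparameter
+        x = torch.clamp(a * v, -0.99, 0.99)  # keep arcsin's derivative finite
+        # d/dv [arcsin(a*v)/pi]: a smooth, peaked approximation of the Dirac delta
+        return grad_out * a / (torch.pi * torch.sqrt(1 - x ** 2))
+
+spikes = SpikeAAD.apply(torch.randn(4, requires_grad=True))
+```
+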
+
+ comment: 15pages,12figures +
+
+
+
+
+ + ♻ ☆ TK-Planes: Tiered K-Planes with High Dimensional Feature Vectors for + Dynamic UAV-based Scenes ICRA2025 + + +
+ In this paper, we present a new approach to bridge the domain gap between
+synthetic and real-world data for unmanned aerial vehicle (UAV)-based
+perception. Our formulation is designed for dynamic scenes consisting of small
+moving objects or human actions. We propose an extension of the K-Planes Neural
+Radiance Field (NeRF), wherein our algorithm stores a set of tiered feature
+vectors. The tiered feature vectors are generated to effectively model
+conceptual information about a scene, along with an image decoder that
+transforms output feature maps into RGB images. Our technique leverages the
+information among both static and dynamic objects within a scene and is able
+to capture salient scene attributes of high-altitude videos. We evaluate its
+performance on challenging datasets, including Okutama Action and UG2, and
+observe considerable improvement in accuracy over state-of-the-art neural
+rendering methods.
+
+
+ comment: 8 pages, submitted to ICRA2025 +
+
+
+
+
+ + ♻ ☆ VideoClusterNet: Self-Supervised and Adaptive Face Clustering For Videos ECCV + + +
+ With the rise of digital media content production, the need for analyzing
+movies and TV series episodes to locate the main cast of characters precisely
+is gaining importance. Specifically, Video Face Clustering aims to group
+together detected video face tracks with common facial identities. This problem
+is very challenging due to the large range of pose, expression, appearance, and
+lighting variations of a given face across video frames. Generic pre-trained
+Face Identification (ID) models fail to adapt well to the video production
+domain, given its high dynamic range content and unique cinematic style.
+Furthermore, traditional clustering algorithms depend on hyperparameters
+requiring individual tuning across datasets. In this paper, we present a novel
+video face clustering approach that learns to adapt a generic face ID model to
+new video face tracks in a fully self-supervised fashion. We also propose a
+parameter-free clustering algorithm that is capable of automatically adapting
+to the finetuned model's embedding space for any input video. Due to the lack
+of comprehensive movie face clustering benchmarks, we also present a
+first-of-its-kind movie dataset: MovieFaceCluster. Our dataset is handpicked by
+film industry professionals and contains extremely challenging face ID
+scenarios. Experiments show our method's effectiveness in handling difficult
+mainstream movie scenes on our benchmark dataset and state-of-the-art
+performance on traditional TV series datasets.
+
+
+ comment: Accepted at European Conference on Computer Vision (ECCV) 2024 +
+
+
+
+
+ + ♻ ☆ High-Resolution Maps of Left Atrial Displacements and Strains Estimated + with 3D Cine MRI using Online Learning Neural Networks + + +
+ The functional analysis of the left atrium (LA) is important for evaluating +cardiac health and understanding diseases like atrial fibrillation. Cine MRI is +ideally placed for the detailed 3D characterization of LA motion and +deformation but is lacking appropriate acquisition and analysis tools. Here, we +propose tools for the Analysis for Left Atrial Displacements and DeformatIons +using online learning neural Networks (Aladdin) and present a technical +feasibility study on how Aladdin can characterize 3D LA function globally and +regionally. Aladdin includes an online segmentation and image registration +network, and a strain calculation pipeline tailored to the LA. We create maps +of LA Displacement Vector Field (DVF) magnitude and LA principal strain values +from images of 10 healthy volunteers and 8 patients with cardiovascular disease +(CVD), of which 2 had large left ventricular ejection fraction (LVEF) +impairment. We additionally create an atlas of these biomarkers using the data +from the healthy volunteers. Results showed that Aladdin can accurately track +the LA wall across the cardiac cycle and characterize its motion and +deformation. Global LA function markers assessed with Aladdin agree well with +estimates from 2D Cine MRI. A more marked active contraction phase was observed +in the healthy cohort, while the CVD LVEF group showed overall reduced LA +function. Aladdin is uniquely able to identify LA regions with abnormal +deformation metrics that may indicate focal pathology. We expect Aladdin to +have important clinical applications as it can non-invasively characterize +atrial pathophysiology. All source code and data are available at: +https://github.com/cgalaz01/aladdin_cmr_la. + +
+
+
+
+
+ + ♻ ☆ Comparison of Two Augmentation Methods in Improving Detection Accuracy + of Hemarthrosis + + +
+ With the increase of computing power, machine learning models have been
+introduced into medical imaging to help render medical diagnoses and
+inspections for conditions such as hemophilia, a rare disorder in which blood
+cannot clot normally. Often, one of the bottlenecks in detecting hemophilia is
+the lack of data available to train the algorithm to a sufficient accuracy. As
+a possible solution, this research investigated whether introducing augmented
+data by data synthesis or traditional augmentation techniques can improve model
+accuracy, helping to diagnose the disease. To this end, features of ultrasound
+images were extracted by a pre-trained VGG-16, and similarities were compared
+by the cosine similarity measure based on extracted features in different
+distributions among real images, synthetic images, and augmented images (Real
+vs. Real, Syn vs. Syn, Real vs. Different Batches of Syn, Real vs.
+Augmentation Techniques). Model testing performance was investigated using
+EfficientNet-B4 to recognize "blood" images with the two augmentation methods.
+In addition, gradient-weighted class activation mapping (Grad-CAM)
+visualization was used to interpret unexpected results such as the loss of
+accuracy. Synthetic and real images do not show high similarity, with a mean
+similarity score of 0.4737. The first batch of synthetic images and the
+horizontally flipped images are more similar to the original images. Classic
+augmentation techniques and data synthesis can improve model accuracy, and data
+from traditional augmentation techniques perform better than synthetic data. In
+addition, the Grad-CAM heatmap revealed that the loss of accuracy was due to a
+domain shift. Overall, this research found that the two augmentation methods,
+data synthesis and traditional augmentation techniques, can both improve
+accuracy to a certain extent, helping to diagnose rare diseases.
+
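+The feature-similarity comparison above can be reproduced in spirit with a few
+lines; the backbone, pooling, and inputs here are stand-ins:
+```python
+import torch
+import torch.nn.functional as F
+import torchvision.models as models
+
+vgg = models.vgg16(weights=models.VGG16_Weights.DEFAULT).features.eval()
+
+@torch.no_grad()
+def feat(img: torch.Tensor) -> torch.Tensor:
+    """Flattened VGG-16 convolutional features for a batch (B, 3, 224, 224)."""
+    return vgg(img).flatten(1)
+
+real = feat(torch.rand(4, 3, 224, 224))        # real ultrasound (stand-in)
+synth = feat(torch.rand(4, 3, 224, 224))       # synthetic batch (stand-in)
+sim = F.cosine_similarity(real, synth).mean()  # cf. the 0.4737 reported above
+```
+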
+
+
+
+
+ + ♻ ☆ Efficient 3D Instance Mapping and Localization with Neural Fields + + +
+ We tackle the problem of learning an implicit scene representation for 3D
+instance segmentation from a sequence of posed RGB images. Towards this, we
+introduce 3DIML, a novel framework that efficiently learns a neural label field
+which can render 3D instance segmentation masks from novel viewpoints. As
+opposed to prior art that optimizes a neural field in a self-supervised manner,
+requiring complicated training procedures and loss function design, 3DIML
+leverages a two-phase process. The first phase, InstanceMap, takes as input 2D
+segmentation masks of the image sequence generated by a frontend instance
+segmentation model, and associates corresponding masks across images to 3D
+labels. These almost 3D-consistent pseudolabel masks are then used in the
+second phase, InstanceLift, to supervise the training of a neural label field,
+which interpolates regions missed by InstanceMap and resolves ambiguities.
+Additionally, we introduce InstanceLoc, which enables near real-time
+localization of instance masks given a trained neural label field. We evaluate
+3DIML on sequences from the Replica and ScanNet datasets and demonstrate its
+effectiveness under mild assumptions for the image sequences. We achieve a
+large practical speedup over existing implicit scene representation methods
+with comparable quality, showcasing its potential to facilitate faster and more
+effective 3D scene understanding.
+
+
+
+
+
+ + ♻ ☆ Mitigating Urban-Rural Disparities in Contrastive Representation + Learning with Satellite Imagery + + +
+ Satellite imagery is being leveraged for many societally critical tasks +across climate, economics, and public health. Yet, because of heterogeneity in +landscapes (e.g. how a road looks in different places), models can show +disparate performance across geographic areas. Given the important potential of +disparities in algorithmic systems used in societal contexts, here we consider +the risk of urban-rural disparities in identification of land-cover features. +This is via semantic segmentation (a common computer vision task in which image +regions are labelled according to what is being shown) which uses pre-trained +image representations generated via contrastive self-supervised learning. We +propose fair dense representation with contrastive learning (FairDCL) as a +method for de-biasing the multi-level latent space of convolution neural +network models. The method improves feature identification by removing spurious +model representations which are disparately distributed across urban and rural +areas, and is achieved in an unsupervised way by contrastive pre-training. The +obtained image representation mitigates downstream urban-rural prediction +disparities and outperforms state-of-the-art baselines on real-world satellite +images. Embedding space evaluation and ablation studies further demonstrate +FairDCL's robustness. As generalizability and robustness in geographic imagery +is a nascent topic, our work motivates researchers to consider metrics beyond +average accuracy in such applications. + +
+
+
+
+
+ + ♻ ☆ Checklist to Define the Identification of TP, FP, and FN Object + Detections in Automated Driving + + +
+ The object perception of automated driving systems must pass quality and +robustness tests before a safe deployment. Such tests typically identify true +positive (TP), false-positive (FP), and false-negative (FN) detections and +aggregate them to metrics. Since the literature seems to be lacking a +comprehensive way to define the identification of TPs/FPs/FNs, this paper +provides a checklist of relevant functional aspects and implementation details. +Besides labeling policies of the test set, we cover areas of vision, occlusion +handling, safety-relevant areas, matching criteria, temporal and probabilistic +issues, and further aspects. Even though the checklist cannot be fully +formalized, it can help practitioners minimize the ambiguity of their tests, +which, in turn, makes statements on object perception more reliable and +comparable. + +
+
+ comment: This version improves the checklist's usability by providing bullet + points to follow. It also condenses the contributions to safety assurance + down to the "Related Work" section. 11 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Continual Learning: Forget-free Winning Subnetworks for Video + Representations + + +
+ Inspired by the Lottery Ticket Hypothesis (LTH), which highlights the
+existence of efficient subnetworks within larger, dense networks, we consider a
+high-performing Winning Subnetwork (WSN) that maintains task performance under
+appropriate sparsity conditions for various continual learning tasks. It
+leverages pre-existing weights from dense networks to achieve efficient
+learning in Task Incremental Learning (TIL) and Task-agnostic Incremental
+Learning (TaIL) scenarios. In Few-Shot Class Incremental Learning (FSCIL), a
+variation of WSN referred to as the Soft subnetwork (SoftNet) is designed to
+prevent overfitting when data samples are scarce. Furthermore, the sparse reuse
+of WSN weights is considered for Video Incremental Learning (VIL), where we
+employ a Fourier Subneural Operator (FSO) within WSN. It enables compact
+encoding of videos and identifies reusable subnetworks across varying
+bandwidths. We have integrated FSO into different architectural frameworks for
+continual learning, including VIL, TIL, and FSCIL. Our comprehensive
+experiments demonstrate FSO's effectiveness, significantly improving task
+performance at various convolutional representational levels. Specifically, FSO
+enhances higher-layer performance in TIL and FSCIL and lower-layer performance
+in VIL.
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.14962, + arXiv:2306.11305 +
+
+
+
+
+ + ♻ ☆ Inverse Problems with Diffusion Models: A MAP Estimation Perspective + + +
+ Inverse problems have many applications in science and engineering. In
+computer vision, several image restoration tasks such as inpainting,
+deblurring, and super-resolution can be formally modeled as inverse problems.
+Recently, methods have been developed for solving inverse problems that only
+leverage a pre-trained unconditional diffusion model and do not require
+additional task-specific training. In such methods, however, the inherent
+intractability of determining the conditional score function during the reverse
+diffusion process poses a real challenge, forcing the methods to settle for an
+approximation instead, which affects their performance in practice. Here, we
+propose a MAP estimation framework that models the reverse conditional
+generation process of a continuous-time diffusion model as an optimization of
+the underlying MAP objective, whose gradient term is tractable. In theory, the
+proposed framework can be applied to solve general inverse problems using
+gradient-based optimization methods. However, given the highly non-convex
+nature of the loss objective, finding a perfect gradient-based optimization
+algorithm can be quite challenging; nevertheless, our framework offers several
+potential research directions. We use our proposed formulation to develop
+empirically effective algorithms for image restoration. We validate our
+proposed algorithms with extensive experiments over multiple datasets across
+several restoration tasks.
+
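+ A toy sketch of MAP estimation for a linear inverse problem, assuming NumPy.
+The prior score below is the analytically known score of a standard normal, so
+the whole example is runnable; in the diffusion setting the score would come
+from the pre-trained model, and the step size and iteration count are
+illustrative.
+
+import numpy as np
+
+def map_restore(y, A, score_fn, sigma=1.0, lr=1e-3, steps=1000):
+    # Gradient ascent on log p(x | y) = -||Ax - y||^2 / (2 sigma^2) + log p(x).
+    x = A.T @ y  # crude initialization
+    for _ in range(steps):
+        grad = -A.T @ (A @ x - y) / sigma**2 + score_fn(x)
+        x = x + lr * grad
+    return x
+
+rng = np.random.default_rng(0)
+A = rng.normal(size=(20, 50))              # under-determined forward operator
+y = A @ rng.normal(size=50) + 0.1 * rng.normal(size=20)
+x_hat = map_restore(y, A, score_fn=lambda x: -x)  # N(0, I) prior score
+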
+
+
+
+
+ + ♻ ☆ PFDiff: Training-free Acceleration of Diffusion Models through the + Gradient Guidance of Past and Future + + +
+ Diffusion Probabilistic Models (DPMs) have shown remarkable potential in
+image generation, but their sampling efficiency is hindered by the need for
+numerous denoising steps. Most existing solutions accelerate the sampling
+process by proposing fast ODE solvers. However, the inevitable discretization
+errors of the ODE solvers are significantly magnified when the number of
+function evaluations (NFE) is small. In this work, we propose PFDiff, a novel
+training-free and orthogonal timestep-skipping strategy, which enables existing
+fast ODE solvers to operate with fewer NFE. Specifically, PFDiff initially
+utilizes gradient replacement from past time steps to predict a "springboard".
+Subsequently, it employs this "springboard" along with foresight updates
+inspired by Nesterov momentum to rapidly update current intermediate states.
+This approach effectively reduces unnecessary NFE while correcting for
+discretization errors inherent in first-order ODE solvers. Experimental results
+demonstrate that PFDiff exhibits flexible applicability across various
+pre-trained DPMs, particularly excelling in conditional DPMs and surpassing
+previous state-of-the-art training-free methods. For instance, using DDIM as a
+baseline, we achieve 16.46 FID (4 NFE) compared to 138.81 FID with DDIM on
+ImageNet 64x64 with classifier guidance, and 13.06 FID (10 NFE) on Stable
+Diffusion with a 7.5 guidance scale.
+
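+ A loose sketch of the two-phase idea on top of a deterministic DDIM step,
+assuming an epsilon-prediction model; the exact springboard and correction
+formulas in PFDiff differ, and `eps_model` and the cumulative noise levels
+`a_cur`/`a_next` are illustrative names.
+
+import math
+
+def ddim_update(x, a_cur, a_next, eps):
+    # Deterministic DDIM step between cumulative noise levels a_cur -> a_next.
+    x0 = (x - math.sqrt(1.0 - a_cur) * eps) / math.sqrt(a_cur)
+    return math.sqrt(a_next) * x0 + math.sqrt(1.0 - a_next) * eps
+
+def pfdiff_like_step(x, a_cur, a_next, t_next, eps_past, eps_model):
+    # 1) "Springboard": leap ahead reusing the model output cached from a past
+    #    step, spending no new network evaluation.
+    x_spring = ddim_update(x, a_cur, a_next, eps_past)
+    # 2) One real evaluation at the springboard: a foresight ("future")
+    #    gradient in the spirit of Nesterov momentum.
+    eps_future = eps_model(x_spring, t_next)
+    # 3) Redo the jump from the current state with the foresight estimate,
+    #    and return it for caching toward the next skip.
+    return ddim_update(x, a_cur, a_next, eps_future), eps_future
+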
+
+
+
+
+ + ♻ ☆ OneEncoder: A Lightweight Framework for Progressive Alignment of + Modalities + + +
+ Cross-modal alignment learning integrates information from different
+modalities such as text, image, audio, and video to create unified models. This
+approach develops shared representations and learns correlations between
+modalities, enabling applications such as visual question answering and
+audiovisual content analysis. Current techniques rely on large
+modality-specific encoders, necessitating fine-tuning or training from scratch
+on vast aligned datasets (e.g., text-image, text-audio, image-audio). This
+approach has limitations: (i) it is very expensive due to the need for training
+large encoders on extensive datasets, (ii) acquiring large aligned paired
+datasets is challenging, and (iii) adding new modalities requires retraining
+the entire framework. To address these issues, we propose OneEncoder, a
+lightweight framework that progressively represents and aligns four modalities
+(image, text, audio, video). Initially, we train a lightweight Universal
+Projection module (UP) to align image and text modalities. Then, we freeze the
+pretrained UP and progressively align further modalities to those already
+aligned. OneEncoder operates efficiently and cost-effectively even in scenarios
+where vast aligned datasets are unavailable, due to its lightweight design.
+Trained on small paired datasets, it shows strong performance in tasks like
+classification, querying, and visual question answering, surpassing methods
+that rely on large datasets and specialized encoders.
+
+
+
+
+
+ + ♻ ☆ GDTS: Goal-Guided Diffusion Model with Tree Sampling for Multi-Modal + Pedestrian Trajectory Prediction ICRA 2025 + + +
+ Accurate prediction of pedestrian trajectories is crucial for improving the
+safety of autonomous driving. However, this task is generally nontrivial due to
+the inherent stochasticity of human motion, which naturally requires the
+predictor to generate multi-modal predictions. Previous works leverage various
+generative methods, such as GANs and VAEs, for pedestrian trajectory
+prediction. Nevertheless, these methods may suffer from mode collapse and
+relatively low-quality results. The denoising diffusion probabilistic model
+(DDPM) has recently been applied to trajectory prediction due to its simple
+training process and powerful reconstruction ability. However, current
+diffusion-based methods do not fully utilize input information and usually
+require many denoising iterations, leading to long inference times or an
+additional network for initialization. To address these challenges and
+facilitate the use of diffusion models in multi-modal trajectory prediction, we
+propose GDTS, a novel Goal-Guided Diffusion Model with Tree Sampling for
+multi-modal trajectory prediction. Considering the "goal-driven"
+characteristics of human motion, GDTS leverages goal estimation to guide the
+generation of the diffusion network. A two-stage tree sampling algorithm is
+presented, which leverages common features to reduce the inference time and
+improve accuracy for multi-modal prediction. Experimental results demonstrate
+that our proposed framework achieves performance comparable to the state of the
+art with real-time inference speed on public datasets.
+
+
+ comment: Submitted to ICRA 2025 +
+
+
+
+
+ + ♻ ☆ Mamba-YOLO-World: Marrying YOLO-World with Mamba for Open-Vocabulary + Detection + + +
+ Open-vocabulary detection (OVD) aims to detect objects beyond a predefined
+set of categories. As a pioneering model incorporating the YOLO series into
+OVD, YOLO-World is well-suited for scenarios prioritizing speed and efficiency.
+However, its performance is hindered by its neck feature fusion mechanism,
+which incurs quadratic complexity and limits the guided receptive fields. To
+address these limitations, we present Mamba-YOLO-World, a novel YOLO-based OVD
+model employing the proposed MambaFusion Path Aggregation Network
+(MambaFusion-PAN) as its neck architecture. Specifically, we introduce an
+innovative State Space Model-based feature fusion mechanism consisting of a
+Parallel-Guided Selective Scan algorithm and a Serial-Guided Selective Scan
+algorithm with linear complexity and globally guided receptive fields. It
+leverages multi-modal input sequences and Mamba hidden states to guide the
+selective scanning process. Experiments demonstrate that our model outperforms
+the original YOLO-World on the COCO and LVIS benchmarks in both zero-shot and
+fine-tuning settings while maintaining comparable parameters and FLOPs.
+Additionally, it surpasses existing state-of-the-art OVD methods with fewer
+parameters and FLOPs.
+
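+ For intuition, a single-channel toy of the state-space recurrence that gives
+selective scans their linear complexity in sequence length; the guided,
+multi-modal variants in MambaFusion-PAN are considerably more elaborate, and
+all shapes here are illustrative.
+
+import numpy as np
+
+def selective_scan(x, A, B, C, delta):
+    # x: (T,) inputs; A: (d,) state decay (typically negative); B, C: (T, d)
+    # input-dependent projections; delta: (T,) step sizes.
+    # One O(T) pass over the sequence -- no T x T attention matrix.
+    h = np.zeros(A.shape[0])
+    y = np.empty(x.shape[0])
+    for t in range(x.shape[0]):
+        h = np.exp(delta[t] * A) * h + delta[t] * B[t] * x[t]
+        y[t] = C[t] @ h
+    return y
+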
+
+
+
+
+ + ♻ ☆ LM-Gaussian: Boost Sparse-view 3D Gaussian Splatting with Large Model + Priors + + +
+ We aim to address sparse-view reconstruction of a 3D scene by leveraging
+priors from large-scale vision models. While recent advancements such as 3D
+Gaussian Splatting (3DGS) have demonstrated remarkable successes in 3D
+reconstruction, these methods typically necessitate hundreds of input images
+that densely capture the underlying scene, making them time-consuming and
+impractical for real-world applications. However, sparse-view reconstruction is
+inherently ill-posed and under-constrained, often resulting in inferior and
+incomplete outcomes. This is due to issues such as failed initialization,
+overfitting on input images, and a lack of details. To mitigate these
+challenges, we introduce LM-Gaussian, a method capable of generating
+high-quality reconstructions from a limited number of images. Specifically, we
+propose a robust initialization module that leverages stereo priors to aid in
+the recovery of camera poses and reliable point clouds. Additionally, a
+diffusion-based refinement is iteratively applied to incorporate image
+diffusion priors into the Gaussian optimization process to preserve intricate
+scene details. Finally, we utilize video diffusion priors to further enhance
+the rendered images for realistic visual effects. Overall, our approach
+significantly reduces the data acquisition requirements compared to previous
+3DGS methods. We validate the effectiveness of our framework through
+experiments on various public datasets, demonstrating its potential for
+high-quality 360-degree scene reconstruction. Visual results are on our
+website.
+
+
+ comment: Project page: https://hanyangyu1021.github.io/lm-gaussian.github.io/ +
+
+
+
+
+ + ♻ ☆ High-Resolution Building and Road Detection from Sentinel-2 + + +
+ Mapping buildings and roads automatically with remote sensing typically +requires high-resolution imagery, which is expensive to obtain and often +sparsely available. In this work we demonstrate how multiple 10 m resolution +Sentinel-2 images can be used to generate 50 cm resolution building and road +segmentation masks. This is done by training a `student' model with access to +Sentinel-2 images to reproduce the predictions of a `teacher' model which has +access to corresponding high-resolution imagery. While the predictions do not +have all the fine detail of the teacher model, we find that we are able to +retain much of the performance: for building segmentation we achieve 79.0\% +mIoU, compared to the high-resolution teacher model accuracy of 85.5\% mIoU. We +also describe two related methods that work on Sentinel-2 imagery: one for +counting individual buildings which achieves $R^2 = 0.91$ against true counts +and one for predicting building height with 1.5 meter mean absolute error. This +work opens up new possibilities for using freely available Sentinel-2 imagery +for a range of tasks that previously could only be done with high-resolution +satellite imagery. + +
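+ A minimal sketch of the teacher-student supervision described above, assuming
+PyTorch; the actual loss, architectures, and upsampling to the 50 cm target
+grid are not specified here, so treat every name below as illustrative.
+
+import torch.nn.functional as F
+
+def distill_step(student, optimizer, s2_stack, teacher_probs):
+    # The student sees only the stack of 10 m Sentinel-2 revisits, but is
+    # supervised by the teacher's predictions made from high-res imagery.
+    logits = student(s2_stack)              # (B, C, H, W) at the target grid
+    loss = F.cross_entropy(logits, teacher_probs.argmax(dim=1))
+    optimizer.zero_grad()
+    loss.backward()
+    optimizer.step()
+    return loss.item()
+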
+
+
+
+
+ + ♻ ☆ LTOS: Layout-controllable Text-Object Synthesis via Adaptive + Cross-attention Fusions + + +
+ Controllable text-to-image generation synthesizes visual text and objects in
+images under given conditions, and is frequently applied to emoji and poster
+generation. Visual text rendering and layout-to-image generation have been
+popular tasks in controllable text-to-image generation. However, each of these
+tasks typically focuses on a single modality of generation or rendering,
+leaving gaps yet to be bridged between the approaches designed for each task.
+In this paper, we combine text rendering and layout-to-image generation into a
+single task: the layout-controllable text-object synthesis (LTOS) task, aiming
+at synthesizing images with objects and visual text based on a predefined
+object layout and text contents. As compliant datasets are not readily
+available for our LTOS task, we construct a layout-aware text-object synthesis
+dataset containing elaborate, well-aligned labels of visual text and object
+information. Based on the dataset, we propose a layout-controllable text-object
+adaptive fusion (TOF) framework, which generates images with clear, legible
+visual text and plausible objects. We construct a visual-text rendering module
+to synthesize text and employ an object-layout control module to generate
+objects, integrating the two modules to harmoniously generate text content and
+objects in images. To improve image-text integration, we propose a
+self-adaptive cross-attention fusion module that helps the image generation
+attend more to important text information. Within this fusion module, a
+self-adaptive learnable factor flexibly controls the influence of
+cross-attention outputs on image generation. Experimental results show that our
+method outperforms the state of the art in LTOS, text rendering, and
+layout-to-image tasks, enabling harmonious visual text rendering and object
+generation.
+
+
+
+
+
+ + ♻ ☆ QNCD: Quantization Noise Correction for Diffusion Models + + +
+ Diffusion models have revolutionized image synthesis, setting new benchmarks
+in quality and creativity. However, their widespread adoption is hindered by
+the intensive computation required during the iterative denoising process.
+Post-training quantization (PTQ) presents a solution to accelerate sampling,
+albeit at the expense of sample quality, especially in low-bit settings.
+Addressing this, our study introduces a unified Quantization Noise Correction
+Scheme (QNCD), aimed at minimizing quantization noise throughout the sampling
+process. We identify two primary quantization challenges: intra and inter
+quantization noise. Intra quantization noise, mainly exacerbated by embeddings
+in the resblock module, extends activation quantization ranges, increasing
+disturbances in each single denoising step. Inter quantization noise stems from
+cumulative quantization deviations across the entire denoising process,
+altering data distributions step-by-step. QNCD combats these through
+embedding-derived feature smoothing for eliminating intra quantization noise
+and an effective runtime noise estimation module for dynamically filtering
+inter quantization noise. Extensive experiments demonstrate that our method
+outperforms previous quantization methods for diffusion models, achieving
+lossless results in W4A8 and W8A8 quantization settings on ImageNet (LDM-4).
+Code is available at: https://github.com/huanpengchu/QNCD
+
+
+ comment: Accepted by ACMMM2024 +
+
+
+
+
+ + ♻ ☆ High-Order Evolving Graphs for Enhanced Representation of Traffic + Dynamics ECCV + + +
+ We present an innovative framework for traffic dynamics analysis using +High-Order Evolving Graphs, designed to improve spatio-temporal representations +in autonomous driving contexts. Our approach constructs temporal bidirectional +bipartite graphs that effectively model the complex interactions within traffic +scenes in real-time. By integrating Graph Neural Networks (GNNs) with +high-order multi-aggregation strategies, we significantly enhance the modeling +of traffic scene dynamics, providing a more accurate and detailed analysis of +these interactions. Additionally, we incorporate inductive learning techniques +inspired by the GraphSAGE framework, enabling our model to adapt to new and +unseen traffic scenarios without the need for retraining, thus ensuring robust +generalization. Through extensive experiments on the ROAD and ROAD Waymo +datasets, we establish a comprehensive baseline for further developments, +demonstrating the potential of our method in accurately capturing traffic +behavior. Our results emphasize the value of high-order statistical moments and +feature-gated attention mechanisms in improving traffic behavior analysis, +laying the groundwork for advancing autonomous driving technologies. Our source +code is available at: https://github.com/Addy-1998/High_Order_Graphs + +
+
+ comment: Accepted manuscript - 2nd Workshop on Vision-Centric Autonomous + Driving (VCAD) as part of European Conference on Computer Vision (ECCV) 2024 +
+
+
+
+
+ + ♻ ☆ Annealed Winner-Takes-All for Motion Forecasting + + +
+ In autonomous driving, motion prediction aims at forecasting the future +trajectories of nearby agents, helping the ego vehicle to anticipate behaviors +and drive safely. A key challenge is generating a diverse set of future +predictions, commonly addressed using data-driven models with Multiple Choice +Learning (MCL) architectures and Winner-Takes-All (WTA) training objectives. +However, these methods face initialization sensitivity and training +instabilities. Additionally, to compensate for limited performance, some +approaches rely on training with a large set of hypotheses, requiring a +post-selection step during inference to significantly reduce the number of +predictions. To tackle these issues, we take inspiration from annealed MCL, a +recently introduced technique that improves the convergence properties of MCL +methods through an annealed Winner-Takes-All loss (aWTA). In this paper, we +demonstrate how the aWTA loss can be integrated with state-of-the-art motion +forecasting models to enhance their performance using only a minimal set of +hypotheses, eliminating the need for the cumbersome post-selection step. Our +approach can be easily incorporated into any trajectory prediction model +normally trained using WTA and yields significant improvements. To facilitate +the application of our approach to future motion forecasting models, the code +will be made publicly available upon acceptance: +https://github.com/valeoai/MF_aWTA. + +
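+ A minimal sketch of the annealed Winner-Takes-All objective, assuming PyTorch
+hypothesis tensors; the precise weighting and temperature schedule follow the
+annealed MCL literature and may differ from the paper's implementation.
+
+import torch
+
+def awta_loss(preds, target, temperature):
+    # preds: (K, T, 2) trajectory hypotheses; target: (T, 2) ground truth.
+    # Plain WTA backpropagates only through the single best hypothesis;
+    # annealing spreads the gradient with a softmin over per-hypothesis
+    # errors and recovers WTA as the temperature decays toward zero.
+    errs = ((preds - target.unsqueeze(0)) ** 2).mean(dim=(1, 2))   # (K,)
+    weights = torch.softmax(-errs / temperature, dim=0).detach()
+    return (weights * errs).sum()
+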
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Multiscale Feature Learning Using Co-Tuplet Loss for Offline Handwritten + Signature Verification + + +
+ Handwritten signature verification, crucial for legal and financial +institutions, faces challenges including inter-writer similarity, intra-writer +variations, and limited signature samples. To address these, we introduce the +MultiScale Signature feature learning Network (MS-SigNet) with the co-tuplet +loss, a novel metric learning loss designed for offline handwritten signature +verification. MS-SigNet learns both global and regional signature features from +multiple spatial scales, enhancing feature discrimination. This approach +effectively distinguishes genuine signatures from skilled forgeries by +capturing overall strokes and detailed local differences. The co-tuplet loss, +focusing on multiple positive and negative examples, overcomes the limitations +of typical metric learning losses by addressing inter-writer similarity and +intra-writer variations and emphasizing informative examples. We also present +HanSig, a large-scale Chinese signature dataset to support robust system +development for this language. The dataset is accessible at +\url{https://github.com/hsinmin/HanSig}. Experimental results on four benchmark +datasets in different languages demonstrate the promising performance of our +method in comparison to state-of-the-art approaches. + +
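+ As a hedged illustration of metric learning with multiple positives and
+negatives, the sketch below pushes every skilled forgery farther from the
+anchor than the hardest genuine sample by a margin; the actual co-tuplet loss
+is defined in the paper and differs in how it weights informative examples.
+
+import torch
+
+def tuplet_style_loss(anchor, positives, negatives, margin=0.5):
+    # anchor: (D,); positives: (P, D); negatives: (N, D), all L2-normalized.
+    d_pos = (anchor - positives).pow(2).sum(dim=1)   # (P,)
+    d_neg = (anchor - negatives).pow(2).sum(dim=1)   # (N,)
+    # Hinge on the hardest positive against every negative.
+    return (d_pos.max() - d_neg + margin).clamp(min=0).mean()
+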
+
+
+
+
+ + ♻ ☆ LetsGo: Large-Scale Garage Modeling and Rendering via LiDAR-Assisted + Gaussian Primitives + + +
+ Large garages are ubiquitous yet intricate scenes that present unique +challenges due to their monotonous colors, repetitive patterns, reflective +surfaces, and transparent vehicle glass. Conventional Structure from Motion +(SfM) methods for camera pose estimation and 3D reconstruction often fail in +these environments due to poor correspondence construction. To address these +challenges, we introduce LetsGo, a LiDAR-assisted Gaussian splatting framework +for large-scale garage modeling and rendering. We develop a handheld scanner, +Polar, equipped with IMU, LiDAR, and a fisheye camera, to facilitate accurate +data acquisition. Using this Polar device, we present the GarageWorld dataset, +consisting of eight expansive garage scenes with diverse geometric structures, +which will be made publicly available for further research. Our approach +demonstrates that LiDAR point clouds collected by the Polar device +significantly enhance a suite of 3D Gaussian splatting algorithms for garage +scene modeling and rendering. We introduce a novel depth regularizer that +effectively eliminates floating artifacts in rendered images. Additionally, we +propose a multi-resolution 3D Gaussian representation designed for +Level-of-Detail (LOD) rendering. This includes adapted scaling factors for +individual levels and a random-resolution-level training scheme to optimize the +Gaussians across different resolutions. This representation enables efficient +rendering of large-scale garage scenes on lightweight devices via a web-based +renderer. Experimental results on our GarageWorld dataset, as well as on +ScanNet++ and KITTI-360, demonstrate the superiority of our method in terms of +rendering quality and resource efficiency. + +
+
+ comment: Project Page: https://zhaofuq.github.io/LetsGo/ +
+
+
+
+
+ + ♻ ☆ 3DGS-Calib: 3D Gaussian Splatting for Multimodal SpatioTemporal + Calibration IROS 2024 + + +
+ Reliable multimodal sensor fusion algorithms require accurate spatiotemporal +calibration. Recently, targetless calibration techniques based on implicit +neural representations have proven to provide precise and robust results. +Nevertheless, such methods are inherently slow to train given the high +computational overhead caused by the large number of sampled points required +for volume rendering. With the recent introduction of 3D Gaussian Splatting as +a faster alternative to implicit representation methods, we propose to leverage +this new rendering approach to achieve faster multi-sensor calibration. We +introduce 3DGS-Calib, a new calibration method that relies on the speed and +rendering accuracy of 3D Gaussian Splatting to achieve multimodal +spatiotemporal calibration that is accurate, robust, and with a substantial +speed-up compared to methods relying on implicit neural representations. We +demonstrate the superiority of our proposal with experimental results on +sequences from KITTI-360, a widely used driving dataset. + +
+
+ comment: Accepted at IROS 2024 (Oral presentation). Project page: + https://qherau.github.io/3DGS-Calib/ +
+
+
+
+
+ + ♻ ☆ V2I-Calib: A Novel Calibration Approach for Collaborative Vehicle and + Infrastructure LiDAR Systems IROS2024 + + +
+ Cooperative LiDAR systems integrating vehicles and road infrastructure
+exhibit substantial potential, yet their deployment encounters numerous
+challenges. A pivotal aspect of ensuring data accuracy and consistency across
+such systems is the calibration of LiDAR units across heterogeneous vehicular
+and infrastructural endpoints, termed V2I calibration. This necessitates
+calibration methods that are both real-time and robust, particularly methods
+that can ensure robust performance in urban canyon scenarios without relying on
+initial positioning values. Accordingly, this paper introduces a novel approach
+to V2I calibration, leveraging spatial association information among perceived
+objects. Central to this method is the innovative Overall Intersection over
+Union (oIoU) metric, which quantifies the correlation between targets
+identified by vehicle and infrastructure systems, thereby facilitating the
+real-time monitoring of calibration results. Our approach involves identifying
+common targets within the perception results of vehicle and infrastructure
+LiDAR systems through the construction of an affinity matrix. These common
+targets then form the basis for the calculation and optimization of extrinsic
+parameters. Comparative and ablation studies conducted using the DAIR-V2X
+dataset substantiate the superiority of our approach. For further insights and
+resources, our project repository is accessible at
+https://github.com/MassimoQu/v2i-calib.
+
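+ A small sketch of the affinity-matrix step, assuming NumPy and boxes already
+projected into a common frame; the exact oIoU definition is the paper's, so
+the scalar score below is only a plausible stand-in.
+
+import numpy as np
+
+def affinity_matrix(vehicle_boxes, infra_boxes, iou_fn):
+    # Pairwise IoU between objects perceived by the vehicle and by the
+    # infrastructure; common targets appear as high-affinity pairs.
+    A = np.zeros((len(vehicle_boxes), len(infra_boxes)))
+    for i, vb in enumerate(vehicle_boxes):
+        for j, ib in enumerate(infra_boxes):
+            A[i, j] = iou_fn(vb, ib)
+    return A
+
+def overall_iou(A, thr=0.3):
+    # Average IoU over matched pairs: a cheap online indicator of whether the
+    # current extrinsic parameters still align the two point-cloud frames.
+    best = A.max(axis=1)
+    matched = best[best >= thr]
+    return float(matched.mean()) if matched.size else 0.0
+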
+
+ comment: IROS2024 +
+
+
+
+
+ + ♻ ☆ CoMT: Chain-of-Medical-Thought Reduces Hallucination in Medical Report + Generation + + +
+ Automatic medical report generation (MRG), which possesses significant
+research value as it can aid radiologists in clinical diagnosis and report
+composition, has garnered increasing attention. Despite recent progress,
+generating accurate reports remains arduous due to the requirement for precise
+clinical comprehension and disease diagnosis inference. Furthermore, owing to
+the limited accessibility of medical data and the imbalanced distribution of
+diseases, the underrepresentation of rare diseases in training data makes
+large-scale medical visual language models (LVLMs) prone to hallucinations,
+such as omissions or fabrications, severely undermining diagnostic performance
+and further intensifying the challenges for MRG in practice. In this study, to
+effectively mitigate hallucinations in medical report generation, we propose a
+chain-of-medical-thought approach (CoMT), which imitates the cognitive process
+of human doctors by decomposing diagnostic procedures. The radiological
+features with different importance are structured into fine-grained medical
+thought chains to enhance the inferential ability during diagnosis, thereby
+alleviating hallucination problems and enhancing the diagnostic accuracy of
+MRG. All resources of this work will be released soon.
+
+
+
+
+
+ + ♻ ☆ Unsupervised Cross-domain Pulmonary Nodule Detection without Source Data + + +
+ Cross-domain pulmonary nodule detection suffers from performance degradation
+due to a large shift in data distributions between the source and target
+domains. Besides, considering the high cost of medical data annotation, it is
+often assumed that the target images are unlabeled. Existing approaches have
+made much progress in this unsupervised domain adaptation setting. However, the
+setting is rarely realistic in medical applications, since the source medical
+data are often inaccessible due to privacy concerns. This motivates us to
+propose a Source-free Unsupervised cross-domain method for Pulmonary nodule
+detection (SUP), named the Instance-level Contrastive Instruction fine-tuning
+framework (ICI). It first adapts the source model to the target domain by
+utilizing instance-level contrastive learning. The adapted model is then
+trained in a teacher-student interaction manner, and a weighted entropy loss is
+incorporated to further improve the accuracy. We establish a benchmark by
+adapting a pre-trained source model to three popular datasets for pulmonary
+nodule detection. To the best of our knowledge, this represents the first
+exploration of source-free unsupervised domain adaptation in medical image
+object detection. Our extensive evaluations reveal that SUP-ICI substantially
+surpasses existing state-of-the-art approaches, achieving FROC score
+improvements ranging from 8.98% to 16.05%. This breakthrough not only sets a
+new precedent for domain adaptation techniques in medical imaging but also
+significantly advances the field toward overcoming challenges posed by data
+privacy and availability. Code: https://github.com/Ruixxxx/SFUDA.
+
+
+
+
+
+ + ♻ ☆ Multi-modal Relation Distillation for Unified 3D Representation Learning ECCV2024 + + +
+ Recent advancements in multi-modal pre-training for 3D point clouds have +demonstrated promising results by aligning heterogeneous features across 3D +shapes and their corresponding 2D images and language descriptions. However, +current straightforward solutions often overlook intricate structural relations +among samples, potentially limiting the full capabilities of multi-modal +learning. To address this issue, we introduce Multi-modal Relation Distillation +(MRD), a tri-modal pre-training framework, which is designed to effectively +distill reputable large Vision-Language Models (VLM) into 3D backbones. MRD +aims to capture both intra-relations within each modality as well as +cross-relations between different modalities and produce more discriminative 3D +shape representations. Notably, MRD achieves significant improvements in +downstream zero-shot classification tasks and cross-modality retrieval tasks, +delivering new state-of-the-art performance. + +
+
+ comment: Accepted by ECCV2024 +
+
+
+
+
+ + ♻ ☆ DreamMover: Leveraging the Prior of Diffusion Models for Image + Interpolation with Large Motion ECCV 2024 + + +
+ We study the problem of generating intermediate images from image pairs with +large motion while maintaining semantic consistency. Due to the large motion, +the intermediate semantic information may be absent in input images. Existing +methods either limit to small motion or focus on topologically similar objects, +leading to artifacts and inconsistency in the interpolation results. To +overcome this challenge, we delve into pre-trained image diffusion models for +their capabilities in semantic cognition and representations, ensuring +consistent expression of the absent intermediate semantic representations with +the input. To this end, we propose DreamMover, a novel image interpolation +framework with three main components: 1) A natural flow estimator based on the +diffusion model that can implicitly reason about the semantic correspondence +between two images. 2) To avoid the loss of detailed information during fusion, +our key insight is to fuse information in two parts, high-level space and +low-level space. 3) To enhance the consistency between the generated images and +input, we propose the self-attention concatenation and replacement approach. +Lastly, we present a challenging benchmark dataset InterpBench to evaluate the +semantic consistency of generated results. Extensive experiments demonstrate +the effectiveness of our method. Our project is available at +https://dreamm0ver.github.io . + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ HENet: Hybrid Encoding for End-to-end Multi-task 3D Perception from + Multi-view Cameras ECCV 2024 + + +
+ Three-dimensional perception from multi-view cameras is a crucial component +in autonomous driving systems, which involves multiple tasks like 3D object +detection and bird's-eye-view (BEV) semantic segmentation. To improve +perception precision, large image encoders, high-resolution images, and +long-term temporal inputs have been adopted in recent 3D perception models, +bringing remarkable performance gains. However, these techniques are often +incompatible in training and inference scenarios due to computational resource +constraints. Besides, modern autonomous driving systems prefer to adopt an +end-to-end framework for multi-task 3D perception, which can simplify the +overall system architecture and reduce the implementation complexity. However, +conflict between tasks often arises when optimizing multiple tasks jointly +within an end-to-end 3D perception model. To alleviate these issues, we present +an end-to-end framework named HENet for multi-task 3D perception in this paper. +Specifically, we propose a hybrid image encoding network, using a large image +encoder for short-term frames and a small image encoder for long-term temporal +frames. Then, we introduce a temporal feature integration module based on the +attention mechanism to fuse the features of different frames extracted by the +two aforementioned hybrid image encoders. Finally, according to the +characteristics of each perception task, we utilize BEV features of different +grid sizes, independent BEV encoders, and task decoders for different tasks. +Experimental results show that HENet achieves state-of-the-art end-to-end +multi-task 3D perception results on the nuScenes benchmark, including 3D object +detection and BEV semantic segmentation. The source code and models will be +released at https://github.com/VDIGPKU/HENet. + +
+
+ comment: ECCV 2024 +
+
+
+
+
+ + ♻ ☆ 3D Reconstruction with Fast Dipole Sums + + +
+ We introduce a method for high-quality 3D reconstruction from multi-view +images. Our method uses a new point-based representation, the regularized +dipole sum, which generalizes the winding number to allow for interpolation of +per-point attributes in point clouds with noisy or outlier points. Using +regularized dipole sums, we represent implicit geometry and radiance fields as +per-point attributes of a dense point cloud, which we initialize from structure +from motion. We additionally derive Barnes-Hut fast summation schemes for +accelerated forward and adjoint dipole sum queries. These queries facilitate +the use of ray tracing to efficiently and differentiably render images with our +point-based representations, and thus update their point attributes to optimize +scene geometry and appearance. We evaluate our method in inverse rendering +applications against state-of-the-art alternatives, based on ray tracing of +neural representations or rasterization of Gaussian point-based +representations. Our method significantly improves 3D reconstruction quality +and robustness at equal runtimes, while also supporting more general rendering +methods such as shadow rays for direct illumination. + +
+
+ comment: project page: https://imaging.cs.cmu.edu/fast_dipole_sums +
+
+
+
+
+ + ♻ ☆ Tracking-Assisted Object Detection with Event Cameras + + +
+ Event-based object detection has recently garnered attention in the computer
+vision community due to the exceptional properties of event cameras, such as
+high dynamic range and the absence of motion blur. However, feature
+asynchronism and sparsity cause objects with no relative motion to the camera
+to become invisible, posing a significant challenge for the task. Prior works
+have studied various implicitly learned memories to retain as many temporal
+cues as possible. However, implicit memories still struggle to preserve
+long-term features effectively. In this paper, we consider those invisible
+objects as pseudo-occluded objects and aim to detect them by tracking through
+occlusions. Firstly, we introduce the visibility attribute of objects and
+contribute an auto-labeling algorithm that not only cleans the existing event
+camera dataset but also appends additional visibility labels to it. Secondly,
+we exploit tracking strategies for pseudo-occluded objects to maintain their
+permanence and retain their bounding boxes, even when features have not been
+available for a very long time. These strategies can be treated as an
+explicitly learned memory, guided by the tracking objective, that records the
+displacements of objects across frames. Lastly, we propose a spatio-temporal
+feature aggregation module to enrich the latent features and a consistency loss
+to increase the robustness of the overall pipeline. We conduct comprehensive
+experiments to verify our method's effectiveness, where still objects are
+retained but genuinely occluded objects are discarded. The results demonstrate
+that (1) the additional visibility labels can assist in supervised training,
+and (2) our method outperforms state-of-the-art approaches with a significant
+improvement of 7.9% absolute mAP.
+
+
+
+
+
+ + ♻ ☆ FAIntbench: A Holistic and Precise Benchmark for Bias Evaluation in + Text-to-Image Models + + +
+ The rapid development of and reduced barriers to entry for Text-to-Image
+(T2I) models have raised concerns about the biases in their outputs, but
+existing research lacks a holistic definition and evaluation framework for
+biases, limiting the enhancement of debiasing techniques. To address this
+issue, we introduce FAIntbench, a holistic and precise benchmark for biases in
+T2I models. In contrast to existing benchmarks that evaluate bias in limited
+aspects, FAIntbench evaluates biases from four dimensions: manifestation of
+bias, visibility of bias, acquired attributes, and protected attributes. We
+applied FAIntbench to evaluate seven recent large-scale T2I models and
+conducted a human evaluation, whose results demonstrated the effectiveness of
+FAIntbench in identifying various biases. Our study also revealed new research
+questions about biases, including the side effects of distillation. The
+findings presented here are preliminary, highlighting the potential of
+FAIntbench to advance future research aimed at mitigating biases in T2I models.
+Our benchmark is publicly available to ensure reproducibility.
+
+
+
+
+
+ + ♻ ☆ EvaNet: Elevation-Guided Flood Extent Mapping on Earth Imagery (Extended + Version) IJCAI + + +
+ Accurate and timely mapping of flood extent from high-resolution satellite
+imagery plays a crucial role in disaster management, such as damage assessment
+and relief activities. However, current state-of-the-art solutions are based on
+U-Net, which cannot segment the flood pixels accurately due to ambiguous pixels
+(e.g., tree canopies, clouds) that prevent a direct judgement from the spectral
+features alone. Thanks to the digital elevation model (DEM) data readily
+available from sources such as the United States Geological Survey (USGS), this
+work explores the use of an elevation map to improve flood extent mapping. We
+propose EvaNet, an elevation-guided segmentation model based on the
+encoder-decoder architecture with two novel techniques: (1) a loss function
+encoding the physical law of gravity, namely that if a location is flooded
+(resp. dry), then its adjacent locations with a lower (resp. higher) elevation
+must also be flooded (resp. dry); (2) a new (de)convolution operation that
+integrates the elevation map via a location-sensitive gating mechanism to
+regulate how much spectral features flow through adjacent layers. Extensive
+experiments show that EvaNet significantly outperforms U-Net baselines and
+works as a perfect drop-in replacement for U-Net in existing solutions to flood
+extent mapping.
+
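+ One differentiable way to encode the gravity constraint, assuming PyTorch;
+the paper's actual loss term may be formulated differently, and the tensor
+layout below is illustrative.
+
+import torch
+
+def gravity_loss(probs, elevation, neighbors):
+    # probs, elevation: (H*W,) flattened flood probabilities and elevations;
+    # neighbors: (E, 2) long tensor of adjacent-pixel index pairs.
+    i, j = neighbors[:, 0], neighbors[:, 1]
+    lower = (elevation[j] < elevation[i]).float()
+    # Wherever j lies below i, flooding at i implies flooding at j, so
+    # penalize p_i exceeding p_j on such edges (dry/higher is the mirror case).
+    return ((probs[i] - probs[j]).clamp(min=0) * lower).mean()
+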
+
+ comment: Published at the International Joint Conference on Artificial + Intelligence (IJCAI, 2024) +
+
+
+
+
+
+ ♻ ☆ Mitral Regurgitation Recognition based on Unsupervised
+ Out-of-Distribution Detection with Residual Diffusion Amplification MICCAI
+
+
+
+ Mitral regurgitation (MR) is a serious heart valve disease. Early and +accurate diagnosis of MR via ultrasound video is critical for timely clinical +decision-making and surgical intervention. However, manual MR diagnosis heavily +relies on the operator's experience, which may cause misdiagnosis and +inter-observer variability. Since MR data is limited and has large intra-class +variability, we propose an unsupervised out-of-distribution (OOD) detection +method to identify MR rather than building a deep classifier. To our knowledge, +we are the first to explore OOD in MR ultrasound videos. Our method consists of +a feature extractor, a feature reconstruction model, and a residual +accumulation amplification algorithm. The feature extractor obtains features +from the video clips and feeds them into the feature reconstruction model to +restore the original features. The residual accumulation amplification +algorithm then iteratively performs noise feature reconstruction, amplifying +the reconstructed error of OOD features. This algorithm is straightforward yet +efficient and can seamlessly integrate as a plug-and-play component in +reconstruction-based OOD detection methods. We validated the proposed method on +a large ultrasound dataset containing 893 non-MR and 267 MR videos. +Experimental results show that our OOD detection method can effectively +identify MR samples. + +
+
+ comment: Accepted by MICCAI MLMI 2024, 11 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Adaptive Semantic Consistency for Cross-domain Few-shot Classification + + +
+ Cross-domain few-shot classification (CD-FSC) aims to identify novel target
+classes with a few samples, assuming that there exists a domain shift between
+the source and target domains. Existing state-of-the-art practices typically
+pre-train on the source domain and then finetune on the few-shot target data to
+yield task-adaptive representations. Despite promising progress, these methods
+are prone to overfitting the limited target distribution due to data scarcity,
+and they ignore the transferable knowledge learned in the source domain. To
+alleviate this problem, we propose a simple plug-and-play Adaptive Semantic
+Consistency (ASC) framework, which improves cross-domain robustness by
+preserving source transfer capability during the finetuning stage. Concretely,
+we reuse the source images from the pretraining phase and design an adaptive
+weight assignment strategy to highlight the samples similar to the target
+domain, aiming to aggregate informative target-related knowledge from the
+source domain. Subsequently, a semantic consistency regularization is applied
+to constrain the consistency between the semantic features of the source images
+output by the source model and the target model. In this way, the proposed ASC
+enables explicit transfer of source domain knowledge to prevent the model from
+overfitting the target domain. Extensive experiments on multiple benchmarks
+demonstrate the effectiveness of the proposed ASC, which provides consistent
+improvements over the baselines. The source code is released at
+https://github.com/luhc666/ASC-CDFSL.
+
+
+
+
+
+ + ♻ ☆ Residual Back Projection With Untrained Neural Networks + + +
+ Background and Objective: The success of neural networks in a number of image
+processing tasks has motivated their application in image reconstruction
+problems in computed tomography (CT). While progress has been made in this
+area, the lack of stability and theoretical guarantees for accuracy, together
+with the scarcity of high-quality training data for specific imaging domains,
+poses challenges for many CT applications. In this paper, we present a
+framework for iterative reconstruction (IR) in CT that leverages the
+hierarchical structure of neural networks without the need for training. Our
+framework incorporates this structural information as a deep image prior (DIP)
+and uses a novel residual back projection (RBP) connection that forms the basis
+for our iterations.
+ Methods: We propose using an untrained U-net in conjunction with a novel
+residual back projection to minimize an objective function and achieve
+high-accuracy reconstruction. In each iteration, the weights of the untrained
+U-net are optimized, and the output of the U-net in the current iteration is
+used to update the input of the U-net in the next iteration through the
+aforementioned RBP connection.
+ Results: Experimental results demonstrate that the RBP-DIP framework offers
+improvements over other state-of-the-art conventional IR methods, as well as
+pre-trained and untrained models with similar network structures, under
+multiple conditions. These improvements are particularly significant in the
+few-view, limited-angle, and low-dose imaging configurations.
+ Conclusions: Applied to both parallel- and fan-beam X-ray imaging, our
+framework shows significant improvement under multiple conditions. Furthermore,
+the proposed framework requires no training data and can be adjusted on demand
+to adapt to different conditions (e.g., noise level, geometry, and imaged
+object).
+
+
+
+
+
+ + ♻ ☆ Official-NV: An LLM-Generated News Video Dataset for Multimodal Fake + News Detection + + +
+ News media, especially video news media, have penetrated into every aspect of
+daily life, which also brings the risk of fake news. Therefore, multimodal fake
+news detection has recently garnered increased attention. However, the existing
+datasets are comprised of user-uploaded videos and contain an excess of
+superfluous data, which introduces noise into the model training process. To
+address this issue, we construct a dataset named Official-NV, comprising
+officially published news videos. The crawled videos are augmented through
+LLM-based generation and manual verification, thereby expanding the dataset.
+Furthermore, the proposed dataset is benchmarked against several baselines to
+demonstrate its effectiveness in multimodal fake news detection.
+
+
+
+
+
+
+
+
+ + Information Retrieval 14 + +
+
+
+ + ☆ Generalized compression and compressive search of large datasets + + +
+ The Big Data explosion has necessitated the development of search algorithms
+that scale sub-linearly in time and memory.
+ While compression algorithms and search algorithms do exist independently,
+few algorithms offer both, and those which do are domain-specific.
+ We present panCAKES, a novel approach to compressive search, i.e., a way to
+perform $k$-NN and $\rho$-NN search on compressed data while only decompressing
+a small, relevant portion of the data.
+ panCAKES assumes the manifold hypothesis and leverages the low-dimensional
+structure of the data to compress and search it efficiently.
+ panCAKES is generic over any distance function for which the distance between
+two points is proportional to the memory cost of storing an encoding of one in
+terms of the other.
+ This property holds for many widely-used distance functions, e.g., string
+edit distances (Levenshtein, Needleman-Wunsch, etc.) and set dissimilarity
+measures (Jaccard, Dice, etc.).
+ We benchmark panCAKES on a variety of datasets, including genomic, proteomic,
+and set data.
+ We compare compression ratios to gzip, and search performance between the
+compressed and uncompressed versions of the same dataset.
+ panCAKES achieves compression ratios close to those of gzip, while offering
+sub-linear time performance for $k$-NN and $\rho$-NN search.
+ We conclude that panCAKES is an efficient, general-purpose algorithm for
+exact compressive search on large datasets that obey the manifold hypothesis.
+ We provide an open-source implementation of panCAKES in the Rust programming
+language.
+
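+ A Python sketch of the pruning idea behind compressive search (the reference
+implementation is in Rust): points are stored as encodings relative to a
+cluster center and decoded only when the triangle inequality cannot rule the
+cluster out. The cluster layout and the `decode` callable are assumptions for
+illustration, not panCAKES's actual format.
+
+def knn_compressed(query, clusters, dist, decode, k):
+    # clusters: iterable of (center, radius, encoded_points).
+    found = []  # (distance, point) pairs over decoded candidates
+    for center, radius, encoded in sorted(
+            clusters, key=lambda c: dist(query, c[0])):
+        if len(found) >= k:
+            kth = sorted(d for d, _ in found)[k - 1]
+            if dist(query, center) - radius > kth:
+                continue  # every point inside is farther than the kth best
+        for enc in encoded:
+            p = decode(center, enc)   # decompress only when needed
+            found.append((dist(query, p), p))
+    found.sort(key=lambda t: t[0])
+    return found[:k]  # exact k-NN despite decoding only a few clusters
+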
+
+
+
+
+ + ☆ Decoding Style: Efficient Fine-Tuning of LLMs for Image-Guided Outfit + Recommendation with Preference CIKM 2024 + + +
+ Personalized outfit recommendation remains a complex challenge, demanding
+both fashion compatibility understanding and trend awareness. This paper
+presents a novel framework that harnesses the expressive power of large
+language models (LLMs) for this task, mitigating their "black box" and static
+nature through fine-tuning and direct feedback integration. We bridge the
+visual-textual gap in item descriptions by employing image captioning with a
+Multimodal Large Language Model (MLLM). This enables the LLM to extract style
+and color characteristics from human-curated fashion images, forming the basis
+for personalized recommendations. The LLM is efficiently fine-tuned on the
+open-source Polyvore dataset of curated fashion images, optimizing its ability
+to recommend stylish outfits. A direct preference mechanism using negative
+examples is employed to enhance the LLM's decision-making process. This creates
+a self-enhancing AI feedback loop that continuously refines recommendations in
+line with seasonal fashion trends. Our framework is evaluated on the Polyvore
+dataset, demonstrating its effectiveness in two key tasks: fill-in-the-blank
+and complementary item retrieval. These evaluations underline the framework's
+ability to generate stylish, trend-aligned outfit suggestions, continuously
+improving through direct feedback. The evaluation results demonstrate that our
+proposed framework significantly outperforms the base LLM, creating more
+cohesive outfits. The improved performance in these tasks underscores the
+proposed framework's potential to enhance the shopping experience with accurate
+suggestions, proving its effectiveness over vanilla LLM-based outfit
+generation.
+
+
+ comment: CIKM 2024 +
+
+
+
+
+ + ☆ Understanding the Effects of the Baidu-ULTR Logging Policy on Two-Tower + Models RecSys + '24 + + +
+ Despite the popularity of the two-tower model for unbiased learning to rank +(ULTR) tasks, recent work suggests that it suffers from a major limitation that +could lead to its collapse in industry applications: the problem of logging +policy confounding. Several potential solutions have even been proposed; +however, the evaluation of these methods was mostly conducted using +semi-synthetic simulation experiments. This paper bridges the gap between +theory and practice by investigating the confounding problem on the largest +real-world dataset, Baidu-ULTR. Our main contributions are threefold: 1) we +show that the conditions for the confounding problem are given on Baidu-ULTR, +2) the confounding problem bears no significant effect on the two-tower model, +and 3) we point to a potential mismatch between expert annotations, the golden +standard in ULTR, and user click behavior. + +
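+ For context, a minimal PyTorch sketch of the standard two-tower click model
+under study: one tower scores relevance from query-document features, the
+other models examination from bias features (e.g., position), combined
+multiplicatively in probability space. All dimensions below are illustrative.
+Confounding arises when the logging policy makes the bias features predictive
+of relevance, so the factorization is no longer identifiable from clicks
+alone.
+
+import torch
+import torch.nn as nn
+
+class TwoTower(nn.Module):
+    def __init__(self, d_rel, d_bias, hidden=64):
+        super().__init__()
+        self.rel = nn.Sequential(nn.Linear(d_rel, hidden), nn.ReLU(),
+                                 nn.Linear(hidden, 1))
+        self.bias = nn.Sequential(nn.Linear(d_bias, hidden), nn.ReLU(),
+                                  nn.Linear(hidden, 1))
+
+    def forward(self, x_rel, x_bias):
+        # P(click) = P(relevant | q, d) * P(examined | bias features).
+        return (torch.sigmoid(self.rel(x_rel))
+                * torch.sigmoid(self.bias(x_bias)))
+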
+
+ comment: Accepted at the CONSEQUENCES '24 workshop, co-located with ACM RecSys + '24 +
+
+
+
+
+ + ☆ AlignBot: Aligning VLM-powered Customized Task Planning with User + Reminders Through Fine-Tuning for Household Robots + + +
+ This paper presents AlignBot, a novel framework designed to optimize
+VLM-powered customized task planning for household robots by effectively
+aligning with user reminders. In domestic settings, aligning task planning with
+user reminders poses significant challenges due to the limited quantity,
+diversity, and multimodal nature of the reminders. To address these challenges,
+AlignBot employs a fine-tuned LLaVA-7B model functioning as an adapter for
+GPT-4o. This adapter model internalizes diverse forms of user reminders, such
+as personalized preferences, corrective guidance, and contextual assistance,
+into structured instruction-formatted cues that prompt GPT-4o to generate
+customized task plans. Additionally, AlignBot integrates a dynamic retrieval
+mechanism that selects task-relevant historical successes as prompts for
+GPT-4o, further enhancing task planning accuracy. To validate the effectiveness
+of AlignBot, experiments are conducted in real-world household environments,
+constructed within the laboratory to replicate typical household settings. A
+multimodal dataset with over 1,500 entries derived from volunteer reminders is
+used for training and evaluation. The results demonstrate that AlignBot
+significantly improves customized task planning, outperforming existing LLM-
+and VLM-powered planners by interpreting and aligning with user reminders: it
+achieves an 86.8% success rate compared to 21.6% for the vanilla GPT-4o
+baseline, a 65-percentage-point improvement and over four times greater
+effectiveness. Supplementary materials are available at:
+https://yding25.com/AlignBot/
+
+
+
+
+
+ + ☆ Retrieve, Annotate, Evaluate, Repeat: Leveraging Multimodal LLMs for + Large-Scale Product Retrieval Evaluation + + +
+ Evaluating production-level retrieval systems at scale is a crucial yet
+challenging task due to the limited availability of a large pool of
+well-trained human annotators. Large Language Models (LLMs) have the potential
+to address this scaling issue and offer a viable alternative to humans for the
+bulk of annotation tasks. In this paper, we propose a framework for assessing
+product search engines in a large-scale e-commerce setting, leveraging
+Multimodal LLMs for (i) generating tailored annotation guidelines for
+individual queries, and (ii) conducting the subsequent annotation task. Our
+method, validated through deployment on a large e-commerce platform,
+demonstrates comparable quality to human annotations, significantly reduces
+time and cost, facilitates rapid problem discovery, and provides an effective
+solution for production-level quality control at scale.
+
+
+ comment: 13 pages, 5 figures, 4 Tables +
+
+
+
+
+ + ☆ The Factuality of Large Language Models in the Legal Domain CIKM 2024 + + +
+ This paper investigates the factuality of large language models (LLMs) as +knowledge bases in the legal domain, in a realistic usage scenario: we allow +for acceptable variations in the answer, and let the model abstain from +answering when uncertain. First, we design a dataset of diverse factual +questions about case law and legislation. We then use the dataset to evaluate +several LLMs under different evaluation methods, including exact, alias, and +fuzzy matching. Our results show that the performance improves significantly +under the alias and fuzzy matching methods. Further, we explore the impact of +abstaining and in-context examples, finding that both strategies enhance +precision. Finally, we demonstrate that additional pre-training on legal +documents, as seen with SaulLM, further improves factual precision from 63% to +81%. + +
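+ A small sketch of the three matching regimes, assuming Python's standard
+library; the actual alias sets and fuzzy threshold used in the paper are not
+specified here, so both are illustrative.
+
+import difflib
+
+def is_correct(prediction, gold, aliases=(), fuzzy_threshold=0.85):
+    pred = prediction.strip().lower()
+    answers = {gold.strip().lower(), *(a.strip().lower() for a in aliases)}
+    if pred in answers:
+        return True   # exact or alias match
+    return any(       # fuzzy match on normalized edit similarity
+        difflib.SequenceMatcher(None, pred, a).ratio() >= fuzzy_threshold
+        for a in answers)
+
+# Abstention: treating an empty or "I don't know" prediction as no answer
+# raises precision at the cost of coverage.
+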
+
+ comment: CIKM 2024, short paper +
+
+
+
+
+ + ☆ Active Reconfigurable Intelligent Surface Empowered Synthetic Aperture + Radar Imaging + + +
+ Synthetic Aperture Radar (SAR) utilizes the movement of the radar antenna +over a specific area of interest to achieve higher spatial resolution imaging. +In this paper, we aim to investigate the realization of SAR imaging for a +stationary radar system with the assistance of active reconfigurable +intelligent surface (ARIS) mounted on an unmanned aerial vehicle (UAV). As the +UAV moves along the stationary trajectory, the ARIS can not only build a +high-quality virtual line-of-sight (LoS) propagation path, but its mobility can +also effectively create a much larger virtual aperture, which can be utilized +to realize a SAR system. In this paper, we first present a range-Doppler (RD) +imaging algorithm to obtain imaging results for the proposed ARIS-empowered SAR +system. Then, to further improve the SAR imaging performance, we attempt to +optimize the reflection coefficients of ARIS to maximize the signal-to-noise +ratio (SNR) at the stationary radar receiver under the constraints of ARIS +maximum power and amplification factor. An effective algorithm based on +fractional programming (FP) and majorization minimization (MM) methods is +developed to solve the resulting non-convex problem. Simulation results +validate the effectiveness of ARIS-assisted SAR imaging and our proposed RD +imaging and ARIS optimization algorithms. + +
+
+
+
+
+ + ☆ FLARE: Fusing Language Models and Collaborative Architectures for + Recommender Enhancement + + +
+ Hybrid recommender systems, combining item IDs and textual descriptions, +offer potential for improved accuracy. However, previous work has largely +focused on smaller datasets and model architectures. This paper introduces +Flare (Fusing Language models and collaborative Architectures for Recommender +Enhancement), a novel hybrid recommender that integrates a language model (mT5) +with a collaborative filtering model (Bert4Rec) using a Perceiver network. This +architecture allows Flare to effectively combine collaborative and content +information for enhanced recommendations. + We conduct a two-stage evaluation, first assessing Flare's performance +against established baselines on smaller datasets, where it demonstrates +competitive accuracy. Subsequently, we evaluate Flare on a larger, more +realistic dataset with a significantly larger item vocabulary, introducing new +baselines for this setting. Finally, we showcase Flare's inherent ability to +support critiquing, enabling users to provide feedback and refine +recommendations. We further leverage critiquing as an evaluation method to +assess the model's language understanding and its transferability to the +recommendation task. + +
+
+
+
+
+
+ ☆ Basket-Enhanced Heterogeneous Hypergraph for Price-Sensitive Next Basket
+ Recommendation
+
+
+
+ Next Basket Recommendation (NBR) is a new type of recommender system that
+predicts combinations of items users are likely to purchase together. Existing
+NBR models often overlook a crucial factor, price, and do not fully capture
+item-basket-user interactions. To address these limitations, we propose a novel
+method called Basket-augmented Dynamic Heterogeneous Hypergraph (BDHH). BDHH
+utilizes a heterogeneous multi-relational graph to capture the intricate
+relationships among item features, with price as a critical factor. Moreover,
+our approach includes a basket-guided dynamic augmentation network that
+dynamically enhances item-basket-user interactions. Experiments on real-world
+datasets demonstrate that BDHH significantly improves recommendation accuracy,
+providing a more comprehensive understanding of user behavior.
+
+
+
+
+
+ + ☆ An Enhanced-State Reinforcement Learning Algorithm for Multi-Task Fusion + in Large-Scale Recommender Systems + + +
+ As the last key stage of Recommender Systems (RSs), Multi-Task Fusion (MTF)
+is in charge of combining the multiple scores predicted by Multi-Task Learning
+(MTL) into a final score to maximize user satisfaction, which decides the
+ultimate recommendation results. In recent years, to maximize long-term user
+satisfaction within a recommendation session, Reinforcement Learning (RL) has
+been widely used for MTF in large-scale RSs. However, limited by their modeling
+pattern, all current RL-MTF methods can only utilize user features as the state
+to generate actions for each user, and are unable to make use of item features
+and other valuable features, which leads to suboptimal results. Addressing this
+problem requires breaking through the current modeling pattern of RL-MTF. To
+solve it, we propose a novel method called Enhanced-State RL for MTF in RSs.
+Unlike the existing methods mentioned above, our method first defines user
+features, item features, and other valuable features collectively as the
+enhanced state; it then proposes a novel actor and critic learning process that
+utilizes the enhanced state to take much better actions for each user-item
+pair. To the best of our knowledge, this modeling pattern is proposed for the
+first time in the field of RL-MTF. We conduct extensive offline and online
+experiments in a large-scale RS. The results demonstrate that our model
+outperforms other models significantly. Enhanced-State RL has been fully
+deployed in our RS for more than half a year, improving user valid consumption
+by +3.84% and user duration time by +0.58% compared to the baseline.
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2404.17589 +
+
+
+
+
+ + ☆ Designing Interfaces for Multimodal Vector Search Applications CIKM 2024 + + +
+ Multimodal vector search offers a new paradigm for information retrieval by
+exposing numerous pieces of functionality which are not possible in traditional
+lexical search engines. While multimodal vector search can be treated as a
+drop-in replacement for these traditional systems, the experience can be
+significantly enhanced by leveraging the unique capabilities of multimodal
+search. Central to any information retrieval system is a user who expresses an
+information need. Traditional user interfaces with a single search bar allow
+users to interact with lexical search systems effectively; however, they are
+not necessarily optimal for multimodal vector search. In this paper we explore
+novel capabilities of multimodal vector search applications utilising CLIP
+models and present implementations and design patterns which better allow users
+to express their information needs and effectively interact with these systems
+in an information retrieval context.
+
+
+
+ comment: 12 pages, 8 figures, CIKM 2024 MMSR Workshop +
+
+
+
+
+ + ♻ ☆ Language Models and Retrieval Augmented Generation for Automated + Structured Data Extraction from Diagnostic Reports + + +
+ Purpose: To develop and evaluate an automated system for extracting
+structured clinical information from unstructured radiology and pathology
+reports using open-weights large language models (LMs) and retrieval augmented
+generation (RAG), and to assess the effects of model configuration variables on
+extraction performance. Methods and Materials: The study utilized two datasets:
+7,294 radiology reports annotated for Brain Tumor Reporting and Data System
+(BT-RADS) scores and 2,154 pathology reports annotated for isocitrate
+dehydrogenase (IDH) mutation status. An automated pipeline was developed to
+benchmark the performance of various LMs and RAG configurations. The impact of
+model size, quantization, prompting strategies, output formatting, and
+inference parameters was systematically evaluated. Results: The best-performing
+models achieved over 98% accuracy in extracting BT-RADS scores from radiology
+reports and over 90% for IDH mutation status extraction from pathology reports;
+the top-performing model was a medically fine-tuned Llama 3. Larger, newer, and
+domain fine-tuned models consistently outperformed older and smaller models.
+Model quantization had minimal impact on performance. Few-shot prompting
+significantly improved accuracy. RAG improved performance for complex pathology
+reports but not for shorter radiology reports. Conclusions: Open LMs
+demonstrate significant potential for automated extraction of structured
+clinical data from unstructured clinical reports with local privacy-preserving
+application. Careful model selection, prompt engineering, and semi-automated
+optimization using annotated data are critical for optimal performance. These
+approaches could be reliable enough for practical use in research workflows,
+highlighting the potential for human-machine collaboration in healthcare data
+extraction.
+
+
+
+
+
+
+ + ♻ ☆ AugTriever: Unsupervised Dense Retrieval by Scalable Data Augmentation + + +
+ Dense retrievers have made significant strides in text retrieval and
+open-domain question answering. However, most of these achievements have relied
+heavily on extensive human-annotated supervision. In this study, we aim to
+develop unsupervised methods for improving dense retrieval models. We propose
+two approaches that enable annotation-free and scalable training by creating
+pseudo query-document pairs: query extraction and transferred query generation.
+The query extraction method involves selecting salient spans from the original
+document to generate pseudo queries. On the other hand, the transferred query
+generation method utilizes generation models trained for other NLP tasks, such
+as summarization, to produce pseudo queries. Through extensive experimentation,
+we demonstrate that models trained using these augmentation methods can achieve
+comparable, if not better, performance than multiple strong dense baselines.
+Moreover, combining these strategies leads to further improvements, resulting
+in superior performance of unsupervised dense retrieval, unsupervised domain
+adaptation and supervised finetuning, benchmarked on both BEIR and ODQA
+datasets. Code and datasets are publicly available at
+https://github.com/salesforce/AugTriever.
+
+
+
+ comment: DCAI24, October 25, 2024, Boise, ID +
+
+
+
+
+ + ♻ ☆ A Best-of-Both Approach to Improve Match Predictions and Reciprocal + Recommendations for Job Search + + +
+ Matching users with mutual preferences is a critical aspect of services +driven by reciprocal recommendations, such as job search. To produce +recommendations in such scenarios, one can predict match probabilities and +construct rankings based on these predictions. However, this direct match +prediction approach often underperforms due to the extreme sparsity of match +labels. Therefore, most existing methods predict preferences separately for +each direction (e.g., job seeker to employer and employer to job seeker) and +then aggregate the predictions to generate overall matching scores and produce +recommendations. However, this typical approach often leads to practical +issues, such as biased error propagation between the two models. This paper +introduces and demonstrates a novel and practical solution to improve +reciprocal recommendations in production by leveraging pseudo-match scores. +Specifically, our approach generates dense and more directly relevant +pseudo-match scores by combining the true match labels, which are accurate but +sparse, with relatively inaccurate but dense match predictions. We then train a +meta-model to output the final match predictions by minimizing the prediction +loss against the pseudo-match scores. Our method can be seen as a best-of-both +(BoB) approach, as it combines the high-level ideas of both direct match +prediction and the two separate models approach. It also allows for +user-specific weights to construct personalized pseudo-match scores, achieving +even better matching performance through appropriate tuning of the weights. +Offline experiments on real-world job search data demonstrate the superior +performance of our BoB method, particularly with personalized pseudo-match +scores, compared to existing approaches in terms of finding potential matches. + +
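+ The pseudo-match construction can be stated in a few lines. A sketch under
+the assumption that a simple convex blend is used (the paper's weighting
+scheme and meta-model are richer than this):
+
+    import numpy as np
+
+    def pseudo_match_scores(true_labels, pred_scores, w=0.5):
+        """Blend sparse-but-accurate match labels with dense-but-noisy
+        match predictions; w may be user-specific for personalization."""
+        return w * true_labels + (1.0 - w) * pred_scores
+
+    # A meta-model is then trained to minimize its prediction loss against
+    # these dense pseudo-match targets instead of the sparse raw labels.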
+
+
+
+
+
+
+
+ + Machine Learning 122 + +
+
+
+ + ☆ DynaMo: In-Domain Dynamics Pretraining for Visuo-Motor Control + + +
+ Imitation learning has proven to be a powerful tool for training complex
+visuomotor policies. However, current methods often require hundreds to
+thousands of expert demonstrations to handle high-dimensional visual
+observations. A key reason for this poor data efficiency is that visual
+representations are predominantly either pretrained on out-of-domain data or
+trained directly through a behavior cloning objective. In this work, we present
+DynaMo, a new in-domain, self-supervised method for learning visual
+representations. Given a set of expert demonstrations, we jointly learn a
+latent inverse dynamics model and a forward dynamics model over a sequence of
+image embeddings, predicting the next frame in latent space, without
+augmentations, contrastive sampling, or access to ground truth actions.
+Importantly, DynaMo does not require any out-of-domain data such as Internet
+datasets or cross-embodied datasets. On a suite of six simulated and real
+environments, we show that representations learned with DynaMo significantly
+improve downstream imitation learning performance over prior self-supervised
+learning objectives and pretrained representations. Gains from using DynaMo
+hold across policy classes such as Behavior Transformer, Diffusion Policy, MLP,
+and nearest neighbors. Finally, we ablate over key components of DynaMo and
+measure its impact on downstream policy performance. Robot videos are best
+viewed at https://dynamo-ssl.github.io
+
+
+
+
+
+
+ + ☆ Massively Multi-Person 3D Human Motion Forecasting with Scene Context + + +
+ Forecasting long-term 3D human motion is challenging: the stochasticity of
+human behavior makes it hard to generate realistic human motion from the input
+sequence alone. Information on the scene environment and the motion of nearby
+people can greatly aid the generation process. We propose a scene-aware social
+transformer model (SAST) to forecast long-term (10s) human motion. Unlike
+previous models, our approach can model interactions between both widely
+varying numbers of people and objects in a scene. We combine a temporal
+convolutional encoder-decoder architecture with a Transformer-based bottleneck
+that allows us to efficiently combine motion and scene information. We model
+the conditional motion distribution using denoising diffusion models. We
+benchmark our approach on the Humans in Kitchens dataset, which contains 1 to
+16 persons and 29 to 50 objects that are visible simultaneously. Our model
+outperforms other approaches in terms of realism and diversity on different
+metrics and in a user study. Code is available at
+https://github.com/felixbmuller/SAST.
+
+
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ☆ To CoT or not to CoT? Chain-of-thought helps mainly on math and symbolic + reasoning + + +
+ Chain-of-thought (CoT) via prompting is the de facto method for eliciting +reasoning capabilities from large language models (LLMs). But for what kinds of +tasks is this extra ``thinking'' really helpful? To analyze this, we conducted +a quantitative meta-analysis covering over 100 papers using CoT and ran our own +evaluations of 20 datasets across 14 models. Our results show that CoT gives +strong performance benefits primarily on tasks involving math or logic, with +much smaller gains on other types of tasks. On MMLU, directly generating the +answer without CoT leads to almost identical accuracy as CoT unless the +question or model's response contains an equals sign, indicating symbolic +operations and reasoning. Following this finding, we analyze the behavior of +CoT on these problems by separating planning and execution and comparing +against tool-augmented LLMs. Much of CoT's gain comes from improving symbolic +execution, but it underperforms relative to using a symbolic solver. Our +results indicate that CoT can be applied selectively, maintaining performance +while saving inference costs. Furthermore, they suggest a need to move beyond +prompt-based CoT to new paradigms that better leverage intermediate computation +across the whole range of LLM applications. + +
+
+
+
+
+ + ☆ Finetuning Language Models to Emit Linguistic Expressions of Uncertainty + + +
+ Large language models (LLMs) are increasingly employed in information-seeking +and decision-making tasks. Despite their broad utility, LLMs tend to generate +information that conflicts with real-world facts, and their persuasive style +can make these inaccuracies appear confident and convincing. As a result, +end-users struggle to consistently align the confidence expressed by LLMs with +the accuracy of their predictions, often leading to either blind trust in all +outputs or a complete disregard for their reliability. In this work, we explore +supervised finetuning on uncertainty-augmented predictions as a method to +develop models that produce linguistic expressions of uncertainty. +Specifically, we measure the calibration of pre-trained models and then +fine-tune language models to generate calibrated linguistic expressions of +uncertainty. Through experiments on various question-answering datasets, we +demonstrate that LLMs are well-calibrated in assessing their predictions, and +supervised finetuning based on the model's own confidence leads to +well-calibrated expressions of uncertainty, particularly for single-claim +answers. + +
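+ One standard way to measure the calibration the abstract refers to is the
+expected calibration error (ECE); the equal-width binning below is the common
+variant and not necessarily the authors' exact protocol.
+
+    import numpy as np
+
+    def expected_calibration_error(confidences, correct, n_bins=10):
+        confidences = np.asarray(confidences, dtype=float)
+        correct = np.asarray(correct, dtype=float)
+        bins = np.linspace(0.0, 1.0, n_bins + 1)
+        ece = 0.0
+        for lo, hi in zip(bins[:-1], bins[1:]):
+            mask = (confidences > lo) & (confidences <= hi)
+            if mask.any():
+                # Gap between empirical accuracy and mean confidence,
+                # weighted by how many predictions fall in the bin.
+                gap = abs(correct[mask].mean() - confidences[mask].mean())
+                ece += mask.mean() * gap
+        return ece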
+
+
+
+
+ + ☆ Decoding Style: Efficient Fine-Tuning of LLMs for Image-Guided Outfit + Recommendation with Preference CIKM 2024 + + +
+ Personalized outfit recommendation remains a complex challenge, demanding
+both fashion compatibility understanding and trend awareness. This paper
+presents a novel framework that harnesses the expressive power of large
+language models (LLMs) for this task, mitigating their "black box" and static
+nature through fine-tuning and direct feedback integration. We bridge the
+visual-textual gap in item descriptions by employing image captioning with a
+Multimodal Large Language Model (MLLM). This enables the LLM to extract style
+and color characteristics from human-curated fashion images, forming the basis
+for personalized recommendations. The LLM is efficiently fine-tuned on the
+open-source Polyvore dataset of curated fashion images, optimizing its ability
+to recommend stylish outfits. A direct preference mechanism using negative
+examples is employed to enhance the LLM's decision-making process. This creates
+a self-enhancing AI feedback loop that continuously refines recommendations in
+line with seasonal fashion trends. Our framework is evaluated on the Polyvore
+dataset, demonstrating its effectiveness in two key tasks: fill-in-the-blank
+and complementary item retrieval. These evaluations underline the framework's
+ability to generate stylish, trend-aligned outfit suggestions, continuously
+improving through direct feedback. The evaluation results demonstrate that our
+proposed framework significantly outperforms the base LLM, creating more
+cohesive outfits. The improved performance in these tasks underscores the
+proposed framework's potential to enhance the shopping experience with accurate
+suggestions, proving its effectiveness over vanilla LLM-based outfit
+generation.
+
+
+
+ comment: CIKM 2024 +
+
+
+
+
+ + ☆ GRIN: GRadient-INformed MoE + + +
+ Mixture-of-Experts (MoE) models scale more effectively than dense models due +to sparse computation through expert routing, selectively activating only a +small subset of expert modules. However, sparse computation challenges +traditional training practices, as discrete expert routing hinders standard +backpropagation and thus gradient-based optimization, which are the cornerstone +of deep learning. To better pursue the scaling power of MoE, we introduce GRIN +(GRadient-INformed MoE training), which incorporates sparse gradient estimation +for expert routing and configures model parallelism to avoid token dropping. +Applying GRIN to autoregressive language modeling, we develop a top-2 +16$\times$3.8B MoE model. Our model, with only 6.6B activated parameters, +outperforms a 7B dense model and matches the performance of a 14B dense model +trained on the same data. Extensive evaluations across diverse tasks +demonstrate the potential of GRIN to significantly enhance MoE efficacy, +achieving 79.4 on MMLU, 83.7 on HellaSwag, 74.4 on HumanEval, and 58.9 on MATH. + +
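+ Why discrete routing needs a gradient surrogate can be seen in a generic
+straight-through sketch of top-2 routing. GRIN's actual sparse gradient
+estimator is more sophisticated; this only illustrates the problem it solves.
+
+    import torch
+    import torch.nn.functional as F
+
+    def top2_routing(logits):
+        # logits: (tokens, n_experts) raw router outputs.
+        probs = F.softmax(logits, dim=-1)
+        top_vals, top_idx = probs.topk(2, dim=-1)
+        hard = torch.zeros_like(probs).scatter(-1, top_idx, top_vals)
+        # Straight-through trick: the forward pass uses the sparse mask,
+        # while the backward pass flows through the dense probabilities.
+        return hard + probs - probs.detach()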
+
+ comment: 58 pages +
+
+
+
+
+ + ☆ Almost Sure Convergence of Linear Temporal Difference Learning with + Arbitrary Features + + +
+ Temporal difference (TD) learning with linear function approximation, +abbreviated as linear TD, is a classic and powerful prediction algorithm in +reinforcement learning. While it is well understood that linear TD converges +almost surely to a unique point, this convergence traditionally requires the +assumption that the features used by the approximator are linearly independent. +However, this linear independence assumption does not hold in many practical +scenarios. This work is the first to establish the almost sure convergence of +linear TD without requiring linearly independent features. In fact, we do not +make any assumptions on the features. We prove that the approximated value +function converges to a unique point and the weight iterates converge to a set. +We also establish a notion of local stability of the weight iterates. +Importantly, we do not need to introduce any other additional assumptions and +do not need to make any modification to the linear TD algorithm. Key to our +analysis is a novel characterization of bounded invariant sets of the mean ODE +of linear TD. + +
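+ For reference, the algorithm under analysis is textbook linear TD(0); note
+that nothing below requires the features phi(s) to be linearly independent.
+The environment interface is an illustrative assumption.
+
+    import numpy as np
+
+    def linear_td0(env_step, phi, n_features, s0,
+                   alpha=0.01, gamma=0.99, n_steps=100_000):
+        """env_step(s) -> (s_next, reward); phi(s) -> feature vector."""
+        w = np.zeros(n_features)
+        s = s0
+        for _ in range(n_steps):
+            s_next, r = env_step(s)
+            td_error = r + gamma * phi(s_next) @ w - phi(s) @ w
+            w += alpha * td_error * phi(s)  # semi-gradient update
+            s = s_next
+        return w  # value estimate: V(s) ~ phi(s) @ w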
+
+ comment: 30 pages, 0 figures +
+
+
+
+
+ + ☆ Qwen2.5-Math Technical Report: Toward Mathematical Expert Model via + Self-Improvement + + +
+ In this report, we present a series of math-specific large language models:
+Qwen2.5-Math and Qwen2.5-Math-Instruct-1.5B/7B/72B. The core innovation of the
+Qwen2.5 series lies in integrating the philosophy of self-improvement
+throughout the entire pipeline, from pre-training and post-training to
+inference: (1) During the pre-training phase, Qwen2-Math-Instruct is utilized
+to generate large-scale, high-quality mathematical data. (2) In the
+post-training phase, we develop a reward model (RM) by conducting massive
+sampling from Qwen2-Math-Instruct. This RM is then applied to the iterative
+evolution of data in supervised fine-tuning (SFT). With a stronger SFT model,
+it is possible to iteratively train and update the RM, which in turn guides the
+next round of SFT data iteration. On the final SFT model, we employ the
+ultimate RM for reinforcement learning, resulting in the Qwen2.5-Math-Instruct.
+(3) Furthermore, during the inference stage, the RM is used to guide sampling,
+optimizing the model's performance.
+ Qwen2.5-Math-Instruct supports both Chinese and English, and possesses
+advanced mathematical reasoning capabilities, including Chain-of-Thought (CoT)
+and Tool-Integrated Reasoning (TIR). We evaluate our models on 10 mathematics
+datasets in both English and Chinese, such as GSM8K, MATH, GaoKao, AMC23, and
+AIME24, covering a range of difficulties from grade school level to math
+competition problems.
+
+
+
+
+
+
+ + ☆ Stronger Baseline Models -- A Key Requirement for Aligning Machine + Learning Research with Clinical Utility + + +
+ Machine Learning (ML) research has increased substantially in recent years, +due to the success of predictive modeling across diverse application domains. +However, well-known barriers exist when attempting to deploy ML models in +high-stakes, clinical settings, including lack of model transparency (or the +inability to audit the inference process), large training data requirements +with siloed data sources, and complicated metrics for measuring model utility. +In this work, we show empirically that including stronger baseline models in +healthcare ML evaluations has important downstream effects that aid +practitioners in addressing these challenges. Through a series of case studies, +we find that the common practice of omitting baselines or comparing against a +weak baseline model (e.g. a linear model with no optimization) obscures the +value of ML methods proposed in the research literature. Using these insights, +we propose some best practices that will enable practitioners to more +effectively study and deploy ML models in clinical settings. + +
+
+ comment: 18 pages, 6 figures +
+
+
+
+
+ + ☆ Pareto Data Framework: Steps Towards Resource-Efficient Decision Making + Using Minimum Viable Data (MVD) + + +
+ This paper introduces the Pareto Data Framework, an approach for identifying
+and selecting the Minimum Viable Data (MVD) required for enabling machine
+learning applications on constrained platforms such as embedded systems, mobile
+devices, and Internet of Things (IoT) devices. We demonstrate that strategic
+data reduction can maintain high performance while significantly reducing
+bandwidth, energy, computation, and storage costs. The framework identifies
+Minimum Viable Data (MVD) to optimize efficiency across resource-constrained
+environments without sacrificing performance. It addresses common inefficient
+practices in IoT applications, such as sensor overprovisioning, overprecision,
+and signal oversampling, proposing scalable solutions for optimal sensor
+selection, signal extraction and transmission, and data representation. An
+experimental methodology demonstrates effective acoustic data characterization
+after downsampling, quantization, and truncation to simulate reduced-fidelity
+sensors and network and storage constraints; results show that performance can
+be maintained up to 95\% with sample rates reduced by 75\% and bit depths and
+clip length reduced by 50\%, which translates into substantial cost and
+resource reduction. These findings have implications for the design and
+development of constrained systems. The paper also discusses broader
+implications of the framework, including the potential to democratize advanced
+AI technologies across IoT applications and sectors such as agriculture,
+transportation, and manufacturing to improve access and multiply the benefits
+of data-driven insights.
+
+
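+ A sketch of the fidelity-reduction experiment on acoustic data, mirroring
+the abstract's settings (75% lower sample rate, 50% lower bit depth and clip
+length). The naive decimation below is an assumption; a production pipeline
+would apply an anti-aliasing filter first.
+
+    import numpy as np
+
+    def reduce_fidelity(audio, sr, rate_factor=0.25, bits=8, keep_frac=0.5):
+        # Truncate the clip to a fraction of its original length.
+        audio = audio[: int(len(audio) * keep_frac)]
+        # Downsample by simple decimation.
+        audio = audio[:: int(1 / rate_factor)]
+        # Uniformly quantize amplitudes in [-1, 1] to the target bit depth.
+        levels = 2 ** bits
+        audio = np.round((audio + 1.0) / 2.0 * (levels - 1))
+        audio = audio / (levels - 1) * 2.0 - 1.0
+        return audio, sr * rate_factor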
+
+
+
+
+ + ☆ FedLF: Adaptive Logit Adjustment and Feature Optimization in Federated + Long-Tailed Learning ACML 2024 + + +
+ Federated learning offers a paradigm for addressing the challenge of
+preserving privacy in distributed machine learning. However, datasets
+distributed across each client in the real world are inevitably heterogeneous,
+and if the datasets can be globally aggregated, they tend to be long-tailed
+distributed, which greatly affects the performance of the model. The
+traditional approach to federated learning primarily addresses the
+heterogeneity of data among clients, yet it fails to address the phenomenon of
+class-wise bias in global long-tailed data. This results in the trained model
+focusing on the head classes while neglecting the equally important tail
+classes. Consequently, it is essential to develop a methodology that considers
+classes holistically. To address the above problems, we propose a new method,
+FedLF, which introduces three modifications in the local training phase:
+adaptive logit adjustment, continuous class centred optimization, and feature
+decorrelation. We compare seven state-of-the-art methods with varying degrees
+of data heterogeneity and long-tailed distribution. Extensive experiments on
+benchmark datasets CIFAR-10-LT and CIFAR-100-LT demonstrate that our approach
+effectively mitigates the problem of model performance degradation due to data
+heterogeneity and long-tailed distribution. Our code is available at
+https://github.com/18sym/FedLF.
+
+
+
+ comment: Accepted by ACML 2024 +
+
+
+
+
+ + ☆ Symmetry-Enriched Learning: A Category-Theoretic Framework for Robust + Machine Learning Models + + +
+ This manuscript presents a novel framework that integrates higher-order +symmetries and category theory into machine learning. We introduce new +mathematical constructs, including hyper-symmetry categories and functorial +representations, to model complex transformations within learning algorithms. +Our contributions include the design of symmetry-enriched learning models, the +development of advanced optimization techniques leveraging categorical +symmetries, and the theoretical analysis of their implications for model +robustness, generalization, and convergence. Through rigorous proofs and +practical applications, we demonstrate that incorporating higher-dimensional +categorical structures enhances both the theoretical foundations and practical +capabilities of modern machine learning algorithms, opening new directions for +research and innovation. + +
+
+
+
+
+ + ☆ Towards Interpretable End-Stage Renal Disease (ESRD) Prediction: + Utilizing Administrative Claims Data with Explainable AI Techniques + + +
+ This study explores the potential of utilizing administrative claims data, +combined with advanced machine learning and deep learning techniques, to +predict the progression of Chronic Kidney Disease (CKD) to End-Stage Renal +Disease (ESRD). We analyze a comprehensive, 10-year dataset provided by a major +health insurance organization to develop prediction models for multiple +observation windows using traditional machine learning methods such as Random +Forest and XGBoost as well as deep learning approaches such as Long Short-Term +Memory (LSTM) networks. Our findings demonstrate that the LSTM model, +particularly with a 24-month observation window, exhibits superior performance +in predicting ESRD progression, outperforming existing models in the +literature. We further apply SHapley Additive exPlanations (SHAP) analysis to +enhance interpretability, providing insights into the impact of individual +features on predictions at the individual patient level. This study underscores +the value of leveraging administrative claims data for CKD management and +predicting ESRD progression. + +
+
+ comment: 10pages, 4 figures, AMIA 2024 +
+
+
+
+
+ + ☆ Denoising diffusion models for high-resolution microscopy image + restoration + + +
+ Advances in microscopy imaging enable researchers to visualize structures at
+the nanoscale level, thereby unraveling intricate details of biological
+organization. However, challenges such as image noise, photobleaching of
+fluorophores, and low tolerability of biological samples to high light doses
+remain, restricting temporal resolutions and experiment durations. Reduced
+laser doses enable longer measurements at the cost of lower resolution and
+increased noise, which hinders accurate downstream analyses. Here we train a
+denoising diffusion probabilistic model (DDPM) to predict high-resolution
+images by conditioning the model on low-resolution information. Additionally,
+the probabilistic aspect of the DDPM allows for repeated generation of images
+that tend to further increase the signal-to-noise ratio. We show that our model
+achieves performance better than or similar to that of the previously
+best-performing methods, across four highly diverse datasets. Importantly,
+while each of the previous methods shows competitive performance on some, but
+not all, datasets, our method consistently achieves high performance across all
+four datasets, suggesting high generalizability.
+
+
+
+
+
+
+ + ☆ Unsupervised Domain Adaptation Via Data Pruning + + +
+ The removal of carefully-selected examples from training data has recently +emerged as an effective way of improving the robustness of machine learning +models. However, the best way to select these examples remains an open +question. In this paper, we consider the problem from the perspective of +unsupervised domain adaptation (UDA). We propose AdaPrune, a method for UDA +whereby training examples are removed to attempt to align the training +distribution to that of the target data. By adopting the maximum mean +discrepancy (MMD) as the criterion for alignment, the problem can be neatly +formulated and solved as an integer quadratic program. We evaluate our approach +on a real-world domain shift task of bioacoustic event detection. As a method +for UDA, we show that AdaPrune outperforms related techniques, and is +complementary to other UDA algorithms such as CORAL. Our analysis of the +relationship between the MMD and model accuracy, along with t-SNE plots, +validate the proposed method as a principled and well-founded way of performing +data pruning. + +
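+ The alignment criterion itself is the standard kernel MMD. A biased RBF
+estimate is easy to write down; note that AdaPrune optimizes this quantity
+over which examples to remove, casting the search as an integer quadratic
+program rather than evaluating it directly as here.
+
+    import numpy as np
+
+    def mmd2_rbf(X, Y, sigma=1.0):
+        """Biased MMD^2 estimate between feature sets X and Y."""
+        def k(A, B):
+            d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
+            return np.exp(-d2 / (2.0 * sigma ** 2))
+        return k(X, X).mean() + k(Y, Y).mean() - 2.0 * k(X, Y).mean()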
+
+
+
+
+ + ☆ Fitting Multilevel Factor Models + + +
+ We examine a special case of the multilevel factor model, with covariance
+given by a multilevel low rank (MLR) matrix (Parshakova et al., 2023). We
+develop a novel, fast implementation of the expectation-maximization (EM)
+algorithm, tailored for multilevel factor models, to maximize the likelihood of
+the observed data. This method accommodates any hierarchical structure and
+maintains linear time and storage complexities per iteration. This is achieved
+through a new efficient technique for computing the inverse of the positive
+definite MLR matrix. We show that the inverse of an invertible PSD MLR matrix
+is also an MLR matrix with the same sparsity in factors, and we use the
+recursive Sherman-Morrison-Woodbury matrix identity to obtain the factors of
+the inverse. Additionally, we present an algorithm that computes the Cholesky
+factorization of an expanded matrix with linear time and space complexities,
+yielding the covariance matrix as its Schur complement. This paper is
+accompanied by an open-source package that implements the proposed methods.
+
+
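+ The recursion bottoms out in the classic Sherman-Morrison-Woodbury identity.
+A one-level illustration for a diagonal-plus-low-rank matrix (the paper's MLR
+structure applies this recursively across hierarchy levels):
+
+    import numpy as np
+
+    def woodbury_solve(d, U, b):
+        """Solve (diag(d) + U @ U.T) x = b in O(n r^2) without forming
+        the n x n matrix; d is the diagonal, U is n x r."""
+        Dinv_b = b / d
+        Dinv_U = U / d[:, None]
+        capacitance = np.eye(U.shape[1]) + U.T @ Dinv_U  # r x r
+        return Dinv_b - Dinv_U @ np.linalg.solve(capacitance, U.T @ Dinv_b)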
+
+
+
+
+ + ☆ Dual-Layer Training and Decoding of Large Language Model with + Simultaneously Thinking and Speaking + + +
+ Large language models can reasonably understand and generate human
+expressions but may lack thorough thinking and reasoning mechanisms. Recently
+there have been several studies which enhance the thinking ability of language
+models, but most of them are not data-driven or training-based. In this paper,
+we are motivated by the cognitive mechanism in the natural world, and design a
+novel model architecture called TaS, which first considers the thoughts and
+then expresses the response based upon the query. We design several pipelines
+to annotate or generate the thought contents from prompt-response samples, then
+add language heads in a middle layer which behaves as the thinking layer. We
+train the language model on the thoughts-augmented data and successfully let
+the thinking layer automatically generate reasonable thoughts and finally
+output more reasonable responses. Both qualitative examples and quantitative
+results validate the effectiveness and performance of TaS. Our code is
+available at https://anonymous.4open.science/r/TadE.
+
+
+
+ comment: 9 pages, 5 figures +
+
+
+
+
+ + ☆ Cartan moving frames and the data manifolds + + +
+ The purpose of this paper is to employ the language of Cartan moving frames
+to study the geometry of data manifolds and their Riemannian structure, via the
+data information metric and its curvature at data points. Using this framework
+and through experiments, we explain the response of a neural network by
+pointing out the output classes that are easily reachable from a given input.
+This emphasizes how the proposed mathematical relationship between the output
+of the network and the geometry of its inputs can be exploited as an
+explainable artificial intelligence tool.
+
+
+
+
+
+
+ + ☆ Extended Deep Submodular Functions + + +
+ We introduce a novel category of set functions called Extended Deep
+Submodular functions (EDSFs), which are neural-network-representable. EDSFs
+serve as an extension of Deep Submodular Functions (DSFs), inheriting crucial
+properties from DSFs while addressing their innate limitations. It is known
+that DSFs can represent a limited subset of submodular functions. In contrast,
+through an analysis of polymatroid properties, we establish that EDSFs possess
+the capability to represent all monotone submodular functions, a notable
+enhancement compared to DSFs. Furthermore, our findings demonstrate that EDSFs
+can represent any monotone set function, indicating that the family of EDSFs is
+equivalent to the family of all monotone set functions. Additionally, we prove
+that EDSFs maintain the concavity inherent in DSFs when the components of the
+input vector are non-negative real numbers, an essential feature in certain
+combinatorial optimization problems. Through extensive experiments, we
+illustrate that EDSFs exhibit significantly lower empirical generalization
+error than DSFs in the learning of coverage functions. This suggests that EDSFs
+present a promising advancement in the representation and learning of set
+functions with improved generalization capabilities.
+
+
+
+
+
+
+ + ☆ Understanding the Effects of the Baidu-ULTR Logging Policy on Two-Tower + Models RecSys + '24 + + +
+ Despite the popularity of the two-tower model for unbiased learning to rank
+(ULTR) tasks, recent work suggests that it suffers from a major limitation that
+could lead to its collapse in industry applications: the problem of logging
+policy confounding. Several potential solutions have even been proposed;
+however, the evaluation of these methods was mostly conducted using
+semi-synthetic simulation experiments. This paper bridges the gap between
+theory and practice by investigating the confounding problem on the largest
+real-world dataset, Baidu-ULTR. Our main contributions are threefold: 1) we
+show that the conditions for the confounding problem are present in Baidu-ULTR,
+2) the confounding problem bears no significant effect on the two-tower model,
+and 3) we point to a potential mismatch between expert annotations, the gold
+standard in ULTR, and user click behavior.
+
+
+
+ comment: Accepted at the CONSEQUENCES '24 workshop, co-located with ACM RecSys + '24 +
+
+
+
+
+ + ☆ A Unified Framework for Neural Computation and Learning Over Time + + +
+ This paper proposes Hamiltonian Learning, a novel unified framework for
+learning with neural networks "over time", i.e., from a possibly infinite
+stream of data, in an online manner, without having access to future
+information. Existing works focus on the simplified setting in which the stream
+has a known finite length or is segmented into smaller sequences, leveraging
+well-established learning strategies from statistical machine learning. In this
+paper, the problem of learning over time is rethought from scratch, leveraging
+tools from optimal control theory, which yield a unifying view of the temporal
+dynamics of neural computations and learning. Hamiltonian Learning is based on
+differential equations that: (i) can be integrated without the need for
+external software solvers; (ii) generalize the well-established notion of
+gradient-based learning in feed-forward and recurrent networks; (iii) open up
+novel perspectives. The proposed framework is showcased by experimentally
+proving how it can recover gradient-based learning, comparing it to
+out-of-the-box optimizers, and describing how it is flexible enough to switch
+from fully-local to partially/non-local computational schemes, possibly
+distributed over multiple devices, and to BackPropagation without storing
+activations. Hamiltonian Learning is easy to implement and can help researchers
+approach the problem of learning over time in a principled and innovative
+manner.
+
+
+
+
+
+
+ + ☆ Topological Deep Learning with State-Space Models: A Mamba Approach for + Simplicial Complexes + + +
+ Graph Neural Networks based on the message-passing (MP) mechanism are a +dominant approach for handling graph-structured data. However, they are +inherently limited to modeling only pairwise interactions, making it difficult +to explicitly capture the complexity of systems with $n$-body relations. To +address this, topological deep learning has emerged as a promising field for +studying and modeling higher-order interactions using various topological +domains, such as simplicial and cellular complexes. While these new domains +provide powerful representations, they introduce new challenges, such as +effectively modeling the interactions among higher-order structures through +higher-order MP. Meanwhile, structured state-space sequence models have proven +to be effective for sequence modeling and have recently been adapted for graph +data by encoding the neighborhood of a node as a sequence, thereby avoiding the +MP mechanism. In this work, we propose a novel architecture designed to operate +with simplicial complexes, utilizing the Mamba state-space model as its +backbone. Our approach generates sequences for the nodes based on the +neighboring cells, enabling direct communication between all higher-order +structures, regardless of their rank. We extensively validate our model, +demonstrating that it achieves competitive performance compared to +state-of-the-art models developed for simplicial complexes. + +
+
+
+
+
+ + ☆ On Vision Transformers for Classification Tasks in Side-Scan Sonar + Imagery + + +
+ Side-scan sonar (SSS) imagery presents unique challenges in the +classification of man-made objects on the seafloor due to the complex and +varied underwater environments. Historically, experts have manually interpreted +SSS images, relying on conventional machine learning techniques with +hand-crafted features. While Convolutional Neural Networks (CNNs) significantly +advanced automated classification in this domain, they often fall short when +dealing with diverse seafloor textures, such as rocky or ripple sand bottoms, +where false positive rates may increase. Recently, Vision Transformers (ViTs) +have shown potential in addressing these limitations by utilizing a +self-attention mechanism to capture global information in image patches, +offering more flexibility in processing spatial hierarchies. This paper +rigorously compares the performance of ViT models alongside commonly used CNN +architectures, such as ResNet and ConvNext, for binary classification tasks in +SSS imagery. The dataset encompasses diverse geographical seafloor types and is +balanced between the presence and absence of man-made objects. ViT-based models +exhibit superior classification performance across f1-score, precision, recall, +and accuracy metrics, although at the cost of greater computational resources. +CNNs, with their inductive biases, demonstrate better computational efficiency, +making them suitable for deployment in resource-constrained environments like +underwater vehicles. Future research directions include exploring +self-supervised learning for ViTs and multi-modal fusion to further enhance +performance in challenging underwater environments. + +
+
+
+
+
+ + ☆ Promise and Peril of Collaborative Code Generation Models: Balancing + Effectiveness and Memorization + + +
+ In the rapidly evolving field of machine learning, training models with +datasets from various locations and organizations presents significant +challenges due to privacy and legal concerns. The exploration of effective +collaborative training settings capable of leveraging valuable knowledge from +distributed and isolated datasets is increasingly crucial. This study +investigates key factors that impact the effectiveness of collaborative +training methods in code next-token prediction, as well as the correctness and +utility of the generated code, demonstrating the promise of such methods. +Additionally, we evaluate the memorization of different participant training +data across various collaborative training settings, including centralized, +federated, and incremental training, highlighting their potential risks in +leaking data. Our findings indicate that the size and diversity of code +datasets are pivotal factors influencing the success of collaboratively trained +code models. We show that federated learning achieves competitive performance +compared to centralized training while offering better data protection, as +evidenced by lower memorization ratios in the generated code. However, +federated learning can still produce verbatim code snippets from hidden +training data, potentially violating privacy or copyright. Our study further +explores effectiveness and memorization patterns in incremental learning, +emphasizing the sequence in which individual participant datasets are +introduced. We also identify cross-organizational clones as a prevalent +challenge in both centralized and federated learning scenarios. Our findings +highlight the persistent risk of data leakage during inference, even when +training data remains unseen. We conclude with recommendations for +practitioners and researchers to optimize multisource datasets, propelling +cross-organizational collaboration forward. + +
+
+ comment: Paper accepted to the ASE 2024 Conference Research Track +
+
+
+
+
+ + ☆ All-in-one foundational models learning across quantum chemical levels + + +
+ Machine learning (ML) potentials typically target a single quantum chemical +(QC) level while the ML models developed for multi-fidelity learning have not +been shown to provide scalable solutions for foundational models. Here we +introduce the all-in-one (AIO) ANI model architecture based on multimodal +learning which can learn an arbitrary number of QC levels. Our all-in-one +learning approach offers a more general and easier-to-use alternative to +transfer learning. We use it to train the AIO-ANI-UIP foundational model with +the generalization capability comparable to semi-empirical GFN2-xTB and DFT +with a double-zeta basis set for organic molecules. We show that the AIO-ANI +model can learn across different QC levels ranging from semi-empirical to +density functional theory to coupled cluster. We also use AIO models to design +the foundational model {\Delta}-AIO-ANI based on {\Delta}-learning with +increased accuracy and robustness compared to AIO-ANI-UIP. The code and the +foundational models are available at https://github.com/dralgroup/aio-ani; they +will be integrated into the universal and updatable AI-enhanced QM (UAIQM) +library and made available in the MLatom package so that they can be used +online at the XACS cloud computing platform (see +https://github.com/dralgroup/mlatom for updates). + +
+
+
+
+
+ + ☆ Putting Data at the Centre of Offline Multi-Agent Reinforcement Learning + + +
+ Offline multi-agent reinforcement learning (MARL) is an exciting direction of +research that uses static datasets to find optimal control policies for +multi-agent systems. Though the field is by definition data-driven, efforts +have thus far neglected data in their drive to achieve state-of-the-art +results. We first substantiate this claim by surveying the literature, showing +how the majority of works generate their own datasets without consistent +methodology and provide sparse information about the characteristics of these +datasets. We then show why neglecting the nature of the data is problematic, +through salient examples of how tightly algorithmic performance is coupled to +the dataset used, necessitating a common foundation for experiments in the +field. In response, we take a big step towards improving data usage and data +awareness in offline MARL, with three key contributions: (1) a clear guideline +for generating novel datasets; (2) a standardisation of over 80 existing +datasets, hosted in a publicly available repository, using a consistent storage +format and easy-to-use API; and (3) a suite of analysis tools that allow us to +understand these datasets better, aiding further development. + +
+
+
+
+
+ + ☆ "It Might be Technically Impressive, But It's Practically Useless to + Us": Practices, Challenges, and Opportunities for Cross-Functional + Collaboration around AI within the News Industry + + +
+ Recently, an increasing number of news organizations have integrated +artificial intelligence (AI) into their workflows, leading to a further influx +of AI technologists and data workers into the news industry. This has initiated +cross-functional collaborations between these professionals and journalists. +While prior research has explored the impact of AI-related roles entering the +news industry, there is a lack of studies on how cross-functional collaboration +unfolds between AI professionals and journalists. Through interviews with 17 +journalists, 6 AI technologists, and 3 AI workers with cross-functional +experience from leading news organizations, we investigate the current +practices, challenges, and opportunities for cross-functional collaboration +around AI in today's news industry. We first study how journalists and AI +professionals perceive existing cross-collaboration strategies. We further +explore the challenges of cross-functional collaboration and provide +recommendations for enhancing future cross-functional collaboration around AI +in the news industry. + +
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Unraveling the Hessian: A Key to Smooth Convergence in Loss Function + Landscapes + + +
+ The loss landscape of neural networks is a critical aspect of their training, +and understanding its properties is essential for improving their performance. +In this paper, we investigate how the loss surface changes when the sample size +increases, a previously unexplored issue. We theoretically analyze the +convergence of the loss landscape in a fully connected neural network and +derive upper bounds for the difference in loss function values when adding a +new object to the sample. Our empirical study confirms these results on various +datasets, demonstrating the convergence of the loss function surface for image +classification tasks. Our findings provide insights into the local geometry of +neural loss landscapes and have implications for the development of sample size +determination techniques. + +
+
+
+
+
+ + ☆ An Efficient Model-Agnostic Approach for Uncertainty Estimation in + Data-Restricted Pedometric Applications ICML + + +
+ This paper introduces a model-agnostic approach designed to enhance +uncertainty estimation in the predictive modeling of soil properties, a crucial +factor for advancing pedometrics and the practice of digital soil mapping. For +addressing the typical challenge of data scarcity in soil studies, we present +an improved technique for uncertainty estimation. This method is based on the +transformation of regression tasks into classification problems, which not only +allows for the production of reliable uncertainty estimates but also enables +the application of established machine learning algorithms with competitive +performance that have not yet been utilized in pedometrics. Empirical results +from datasets collected from two German agricultural fields showcase the +practical application of the proposed methodology. Our results and findings +suggest that the proposed approach has the potential to provide better +uncertainty estimation than the models commonly used in pedometrics. + +
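+ The core transformation can be sketched in a few lines: bin the continuous
+target, fit any probabilistic classifier, and read uncertainty off the
+predicted class distribution. The bin count, quantile binning, and model
+choice below are illustrative assumptions, not the paper's configuration.
+
+    import numpy as np
+    from sklearn.ensemble import RandomForestClassifier
+
+    def fit_binned_regressor(X, y, n_bins=10):
+        edges = np.quantile(y, np.linspace(0.0, 1.0, n_bins + 1))
+        labels = np.digitize(y, edges[1:-1])  # class index per sample
+        clf = RandomForestClassifier(n_estimators=200).fit(X, labels)
+        centers = 0.5 * (edges[:-1] + edges[1:])
+        return clf, centers
+
+    def predict_with_uncertainty(clf, centers, X):
+        # Assumes every bin occurs in training (quantile bins make
+        # this likely), so predict_proba has one column per bin.
+        p = clf.predict_proba(X)                    # (n, n_bins)
+        mean = p @ centers                          # expected soil property
+        entropy = -(p * np.log(p + 1e-12)).sum(1)   # uncertainty proxy
+        return mean, entropy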
+
+ comment: To be published in the proceedings of ICMLA 2024: 23rd International + Conference on Machine Learning and Applications +
+
+
+
+
+ + ☆ Metric-Semantic Factor Graph Generation based on Graph Neural Networks ICRA 2025 + + +
+ Understanding the relationships between geometric structures and semantic
+concepts is crucial for building accurate models of complex environments.
+Indoors, certain spatial constraints, such as the relative positioning of
+planes, remain consistent despite variations in layout. This paper explores how
+these invariant relationships can be captured in a graph SLAM framework by
+representing high-level concepts like rooms and walls, linking them to
+geometric elements like planes through an optimizable factor graph. Several
+efforts have tackled this issue with ad-hoc solutions for generating each
+concept and with manually defined factors.
+ This paper proposes a novel method for metric-semantic factor graph
+generation which includes defining a semantic scene graph, integrating
+geometric information, and learning the interconnecting factors, all based on
+Graph Neural Networks (GNNs). An edge classification network (G-GNN) sorts the
+edges between planes into same-room, same-wall, or none types. The resulting
+relations are clustered, generating a room or wall for each cluster. A second
+family of networks (F-GNN) infers the geometric origin of the new nodes. The
+definition of the factors employs the same F-GNN used for the metric attribute
+of the generated nodes. Furthermore, we share the new factor graph with the
+S-Graphs+ algorithm, extending its graph expressiveness and scene
+representation with the ultimate goal of improving SLAM performance. The
+complexity of the environments is increased to N-plane rooms by training the
+networks on L-shaped rooms. The framework is evaluated in synthetic and
+simulated scenarios, as no real datasets of the required complex layouts are
+available.
+
+
+
+ comment: Submitted to ICRA 2025 +
+
+
+
+
+ + ☆ Efficacy of Synthetic Data as a Benchmark + + +
+ Large language models (LLMs) have enabled a range of applications in +zero-shot and few-shot learning settings, including the generation of synthetic +datasets for training and testing. However, to reliably use these synthetic +datasets, it is essential to understand how representative they are of +real-world data. We investigate this by assessing the effectiveness of +generating synthetic data through LLM and using it as a benchmark for various +NLP tasks. Our experiments across six datasets, and three different tasks, show +that while synthetic data can effectively capture performance of various +methods for simpler tasks, such as intent classification, it falls short for +more complex tasks like named entity recognition. Additionally, we propose a +new metric called the bias factor, which evaluates the biases introduced when +the same LLM is used to both generate benchmarking data and to perform the +tasks. We find that smaller LLMs exhibit biases towards their own generated +data, whereas larger models do not. Overall, our findings suggest that the +effectiveness of synthetic data as a benchmark varies depending on the task, +and that practitioners should rely on data generated from multiple larger +models whenever possible. + +
+
+
+
+
+ + ☆ Data Efficient Acoustic Scene Classification using Teacher-Informed + Confusing Class Instruction + + +
+ In this technical report, we describe the SNTL-NTU team's submission for Task +1 Data-Efficient Low-Complexity Acoustic Scene Classification of the detection +and classification of acoustic scenes and events (DCASE) 2024 challenge. Three +systems are introduced to tackle training splits of different sizes. For small +training splits, we explored reducing the complexity of the provided baseline +model by reducing the number of base channels. We introduce data augmentation +in the form of mixup to increase the diversity of training samples. For the +larger training splits, we use FocusNet to provide confusing class information +to an ensemble of multiple Patchout faSt Spectrogram Transformer (PaSST) models +and baseline models trained on the original sampling rate of 44.1 kHz. We use +Knowledge Distillation to distill the ensemble model to the baseline student +model. Training the systems on the TAU Urban Acoustic Scene 2022 Mobile +development dataset yielded the highest average testing accuracy of (62.21, +59.82, 56.81, 53.03, 47.97)% on split (100, 50, 25, 10, 5)% respectively over +the three systems. + +
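+ Mixup, as used above to diversify the small training splits, is a two-line
+augmentation; the Beta parameter below is illustrative, not the submission's
+setting.
+
+    import numpy as np
+
+    def mixup(x, y_onehot, alpha=0.3):
+        lam = np.random.beta(alpha, alpha)       # mixing coefficient
+        idx = np.random.permutation(len(x))      # random partner per sample
+        x_mix = lam * x + (1.0 - lam) * x[idx]
+        y_mix = lam * y_onehot + (1.0 - lam) * y_onehot[idx]
+        return x_mix, y_mix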
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ☆ Reinforcement Learning with Lie Group Orientations for Robotics ICRA 2025 + + +
+ Handling orientations of robots and objects is a crucial aspect of many
+applications. Yet, all too often, there is a lack of mathematical correctness
+when dealing with orientations, especially in learning pipelines involving, for
+example, artificial neural networks. In this paper, we investigate
+reinforcement learning with orientations and propose a simple modification of
+the network's input and output that adheres to the Lie group structure of
+orientations. As a result, we obtain an easy and efficient implementation that
+is directly usable with existing learning libraries and achieves significantly
+better performance than other common orientation representations. We briefly
+introduce Lie theory specifically for orientations in robotics to motivate and
+outline our approach. Subsequently, a thorough empirical evaluation of
+different combinations of orientation representations for states and actions
+demonstrates the superior performance of our proposed approach in different
+scenarios, including direct orientation control, end-effector orientation
+control, and pick-and-place tasks.
+
+
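+ The Lie-group treatment amounts to letting the network act in the tangent
+space and mapping its output onto the manifold. For orientations in SO(3) the
+exponential map is Rodrigues' formula; how the paper wires this into states
+and actions is richer than this sketch.
+
+    import numpy as np
+
+    def so3_exp(omega):
+        """Map an axis-angle vector in R^3 to a rotation matrix in SO(3)."""
+        theta = np.linalg.norm(omega)
+        if theta < 1e-8:
+            return np.eye(3)
+        k = omega / theta
+        K = np.array([[0.0, -k[2], k[1]],
+                      [k[2], 0.0, -k[0]],
+                      [-k[1], k[0], 0.0]])  # skew-symmetric cross product
+        return np.eye(3) + np.sin(theta) * K + (1.0 - np.cos(theta)) * (K @ K)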
+
+ comment: Submitted to ICRA 2025 +
+
+
+
+
+ + ☆ Reinforcement Learning as an Improvement Heuristic for Real-World + Production Scheduling ICML + + +
+ The integration of Reinforcement Learning (RL) with heuristic methods is an +emerging trend for solving optimization problems, which leverages RL's ability +to learn from the data generated during the search process. One promising +approach is to train an RL agent as an improvement heuristic, starting with a +suboptimal solution that is iteratively improved by applying small changes. We +apply this approach to a real-world multiobjective production scheduling +problem. Our approach utilizes a network architecture that includes Transformer +encoding to learn the relationships between jobs. Afterwards, a probability +matrix is generated from which pairs of jobs are sampled and then swapped to +improve the solution. We benchmarked our approach against other heuristics +using real data from our industry partner, demonstrating its superior +performance. + +
+
+ comment: This paper was accepted at the ICMLA 2024 +
+
+
+
+
+ + ☆ An Explainable Machine Learning Approach to Traffic Accident Fatality + Prediction + + +
+ Road traffic accidents (RTA) pose a significant public health threat +worldwide, leading to considerable loss of life and economic burdens. This is +particularly acute in developing countries like Bangladesh. Building reliable +models to forecast crash outcomes is crucial for implementing effective +preventive measures. To aid in developing targeted safety interventions, this +study presents a machine learning-based approach for classifying fatal and +non-fatal road accident outcomes using data from the Dhaka metropolitan traffic +crash database from 2017 to 2022. Our framework utilizes a range of machine +learning classification algorithms, comprising Logistic Regression, Support +Vector Machines, Naive Bayes, Random Forest, Decision Tree, Gradient Boosting, +LightGBM, and Artificial Neural Network. We prioritize model interpretability +by employing the SHAP (SHapley Additive exPlanations) method, which elucidates +the key factors influencing accident fatality. Our results demonstrate that +LightGBM outperforms other models, achieving a ROC-AUC score of 0.72. The +global, local, and feature dependency analyses are conducted to acquire deeper +insights into the behavior of the model. SHAP analysis reveals that casualty +class, time of accident, location, vehicle type, and road type play pivotal +roles in determining fatality risk. These findings offer valuable insights for +policymakers and road safety practitioners in developing countries, enabling +the implementation of evidence-based strategies to reduce traffic crash +fatalities. + +
+
+ comment: 10 Pages, 6 figures, 2 tables, 28th International Conference on + Knowledge-Based and Intelligent Information & Engineering Systems (KES 2024) +
+
+
+
+
+ + ☆ Generation of Complex 3D Human Motion by Temporal and Spatial + Composition of Diffusion Models + + +
+ In this paper, we address the challenge of generating realistic 3D human
+motions for action classes that were never seen during the training phase. Our
+approach involves decomposing complex actions into simpler movements,
+specifically those observed during training, by leveraging the knowledge of
+human motion contained in GPT models. These simpler movements are then
+combined into a single, realistic animation using the properties of diffusion
+models. Our claim is that this decomposition and subsequent recombination of
+simple movements can synthesize an animation that accurately represents the
+complex input action. This method operates during the inference phase and can
+be integrated with any pre-trained diffusion model, enabling the synthesis of
+motion classes not present in the training data. We evaluate our method by
+dividing two benchmark human motion datasets into basic and complex actions,
+and then compare its performance against the state-of-the-art.
+
+
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ☆ Less Memory Means smaller GPUs: Backpropagation with Compressed + Activations ECML + + +
+ The ever-growing scale of deep neural networks (DNNs) has led to an equally
+rapid growth in computational resource requirements. Many recent architectures,
+most prominently Large Language Models, have to be trained using supercomputers
+with thousands of accelerators, such as GPUs or TPUs. Next to the vast number
+of floating point operations, the memory footprint of DNNs is also exploding.
+In contrast, GPU architectures are notoriously short on memory. Even
+comparatively small architectures like some EfficientNet variants cannot be
+trained on a single consumer-grade GPU at reasonable mini-batch sizes. During
+training, intermediate input activations have to be stored until
+backpropagation for gradient calculation. These make up the vast majority of
+the memory footprint. In this work we therefore consider compressing activation
+maps for the backward pass using pooling, which can reduce both the memory
+footprint and the amount of data movement. The forward computation remains
+uncompressed. We empirically show convergence and study effects on feature
+detection using the common vision architecture ResNet as an example. With this
+approach we are able to reduce the peak memory consumption by 29% at the cost
+of a longer training schedule, while maintaining prediction accuracy compared
+to an uncompressed baseline.
+
+
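+ Conceptually, the technique keeps the forward pass exact while storing only
+a pooled copy of each activation for the backward pass. A hypothetical PyTorch
+sketch for a single convolution; the pool size, padding, and nearest-neighbor
+upsampling are illustrative assumptions, not the paper's exact scheme.
+
+    import torch
+    import torch.nn.functional as F
+
+    class CompressedConv2d(torch.autograd.Function):
+        @staticmethod
+        def forward(ctx, x, weight, pool=2):
+            ctx.x_shape = x.shape
+            # Save only the pooled input: this is where memory is saved.
+            ctx.save_for_backward(F.avg_pool2d(x, pool), weight)
+            return F.conv2d(x, weight, padding=1)  # forward stays exact
+
+        @staticmethod
+        def backward(ctx, grad_out):
+            x_small, weight = ctx.saved_tensors
+            x_approx = F.interpolate(x_small, size=ctx.x_shape[-2:],
+                                     mode='nearest')
+            with torch.enable_grad():
+                xa = x_approx.detach().requires_grad_(True)
+                w = weight.detach().requires_grad_(True)
+                out = F.conv2d(xa, w, padding=1)
+                gx, gw = torch.autograd.grad(out, (xa, w), grad_out)
+            # gx is exact (conv is linear in x); gw uses the approximation.
+            return gx, gw, None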
+
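The core idea, an exact forward pass with only pooled activations stored for the backward pass, can be sketched as a custom autograd function in PyTorch. This is a reconstruction under stated assumptions (average pooling, nearest-neighbour upsampling), not the authors' code; note that for a convolution only the weight gradient depends on the stored input, so only that gradient becomes approximate.

```python
import torch
import torch.nn.functional as F

class PooledConv2d(torch.autograd.Function):
    """Conv2d whose forward pass is exact but which stores only an
    average-pooled copy of the input for backpropagation (a sketch of the
    compressed-activation idea; the paper's exact scheme may differ)."""

    @staticmethod
    def forward(ctx, x, weight, k=2):
        ctx.in_shape = x.shape
        # Compress the saved activation: k*k fewer elements are kept.
        ctx.save_for_backward(F.avg_pool2d(x, k), weight)
        return F.conv2d(x, weight, padding=1)

    @staticmethod
    def backward(ctx, grad_out):
        x_small, weight = ctx.saved_tensors
        # Approximate the stored activation by nearest-neighbour upsampling.
        x_approx = F.interpolate(x_small, size=ctx.in_shape[-2:], mode="nearest")
        # Input gradient needs only the weight, so it stays exact.
        grad_x = torch.nn.grad.conv2d_input(ctx.in_shape, weight, grad_out, padding=1)
        # Weight gradient needs the input activation, so it is approximate.
        grad_w = torch.nn.grad.conv2d_weight(x_approx, weight.shape, grad_out, padding=1)
        return grad_x, grad_w, None

x = torch.randn(8, 3, 32, 32, requires_grad=True)
w = torch.randn(16, 3, 3, 3, requires_grad=True)
PooledConv2d.apply(x, w).sum().backward()  # gradients flow; grad_w is approximate
```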
+ comment: Presented at ITEM workshop co-located with ECML PKDD 2024, Vilnius LT +
+
+
+
+
+ + ☆ Multi-Grid Graph Neural Networks with Self-Attention for Computational + Mechanics + + +
+ Advancements in finite element methods have become essential in various
+disciplines, and in particular for Computational Fluid Dynamics (CFD),
+driving research efforts for improved precision and efficiency. While
+Convolutional Neural Networks (CNNs) have found success in CFD by mapping
+meshes into images, recent attention has turned to leveraging Graph Neural
+Networks (GNNs) for direct mesh processing. This paper introduces a novel
+model merging Self-Attention with Message Passing in GNNs, achieving a 15\%
+reduction in RMSE on the well-known flow past a cylinder benchmark.
+Furthermore, a dynamic mesh pruning technique based on Self-Attention is
+proposed that leads to a robust GNN-based multigrid approach, also reducing
+RMSE by 15\%. Additionally, a new self-supervised training method based on
+BERT is presented, resulting in a 25\% RMSE reduction. The paper includes an
+ablation study and shows that the model outperforms state-of-the-art models
+on several challenging datasets, promising advancements similar to those
+recently achieved in natural language and image processing. Finally, the
+paper introduces a dataset with meshes larger than existing ones by at least
+an order of magnitude. Code and Datasets will be released at
+https://github.com/DonsetPG/multigrid-gnn.
+
+
+
+
+
+ + ☆ Secure Control Systems for Autonomous Quadrotors against Cyber-Attacks + + +
+ The problem of safety for robotic systems has been extensively studied. +However, little attention has been given to security issues for +three-dimensional systems, such as quadrotors. Malicious adversaries can +compromise robot sensors and communication networks, causing incidents, +achieving illegal objectives, or even injuring people. This study first designs +an intelligent control system for autonomous quadrotors. Then, it investigates +the problems of optimal false data injection attack scheduling and +countermeasure design for unmanned aerial vehicles. Using a state-of-the-art +deep learning-based approach, an optimal false data injection attack scheme is +proposed to deteriorate a quadrotor's tracking performance with limited attack +energy. Subsequently, an optimal tracking control strategy is learned to +mitigate attacks and recover the quadrotor's tracking performance. We base our +work on Agilicious, a state-of-the-art quadrotor recently deployed for +autonomous settings. This paper is the first in the United Kingdom to deploy +this quadrotor and implement reinforcement learning on its platform. Therefore, +to promote easy reproducibility with minimal engineering overhead, we further +provide (1) a comprehensive breakdown of this quadrotor, including software +stacks and hardware alternatives; (2) a detailed reinforcement-learning +framework to train autonomous controllers on Agilicious agents; and (3) a new +open-source environment that builds upon PyFlyt for future reinforcement +learning research on Agilicious platforms. Both simulated and real-world +experiments are conducted to show the effectiveness of the proposed frameworks +in section 5.2. + +
+
+ comment: The paper is based on an undergraduate thesis and is not intended for + publication in a journal +
+
+
+
+
+ + ☆ Location based Probabilistic Load Forecasting of EV Charging Sites: Deep + Transfer Learning with Multi-Quantile Temporal Convolutional Network + + +
+ Electrification of vehicles is a potential way of reducing fossil fuel usage
+and thus lessening environmental pollution. Electric Vehicles (EVs) of
+various types for different transport modes (including air, water, and land)
+are evolving. Moreover, different EV user groups (commuters, commercial or
+domestic users, drivers) may use different charging infrastructures (public,
+private, home, and workplace) at various times. Therefore, usage patterns and
+energy demand are very stochastic. Characterizing and forecasting the
+charging demand of these diverse EV usage profiles is essential in preventing
+power outages. Previously developed data-driven load models are limited to
+specific use cases and locations. None of these models are simultaneously
+adaptive enough to transfer day-ahead forecasting knowledge among EV charging
+sites at diverse locations, trainable with limited data, and cost-effective.
+This article presents a location-based load forecasting of EV charging sites
+using a deep Multi-Quantile Temporal Convolutional Network (MQ-TCN) to
+overcome the limitations of earlier models. We conducted our experiments on
+data from four charging sites, namely Caltech, JPL, Office-1, and NREL, which
+have diverse EV user types like students, full-time and part-time employees,
+random visitors, etc. With a Prediction Interval Coverage Probability (PICP)
+score of 93.62\%, our proposed deep MQ-TCN model exhibited a remarkable
+28.93\% improvement over the XGBoost model for day-ahead load forecasting at
+the JPL charging site. By transferring knowledge with the inductive Transfer
+Learning (TL) approach, the MQ-TCN model achieved a 96.88\% PICP score for
+the load forecasting task at the NREL site using only two weeks of data.
+
+
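Multi-quantile forecasters such as MQ-TCN are conventionally trained with the pinball (quantile) loss, and interval quality is scored with PICP as reported above. A minimal sketch of both follows, assuming three quantile heads at 10/50/90%; the authors' exact loss weighting and interval construction may differ.

```python
import torch

def pinball_loss(pred, target, quantiles):
    """Pinball (quantile) loss: pred is (batch, n_quantiles), target (batch,).
    For each quantile q, under-predictions are weighted by q and
    over-predictions by (1 - q)."""
    err = target.unsqueeze(-1) - pred
    q = torch.tensor(quantiles, dtype=pred.dtype)
    return torch.maximum(q * err, (q - 1) * err).mean()

pred = torch.randn(32, 3)                       # 10%, 50%, 90% forecasts
target = torch.randn(32)
loss = pinball_loss(pred, target, [0.1, 0.5, 0.9])

# PICP: fraction of observations falling inside the predicted 10-90% interval.
lower, upper = pred[:, 0], pred[:, 2]
picp = ((target >= lower) & (target <= upper)).float().mean()
```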
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ☆ Tight and Efficient Upper Bound on Spectral Norm of Convolutional Layers ECCV 2024 + + +
+ Controlling the spectral norm of the Jacobian matrix, which is related to
+the convolution operation, has been shown to improve generalization, training
+stability and robustness in CNNs. Existing methods for computing the norm
+either tend to overestimate it or their performance may deteriorate quickly
+as the input and kernel sizes increase. In this paper, we demonstrate that
+the tensor version of the spectral norm of a four-dimensional convolution
+kernel, up to a constant factor, serves as an upper bound for the spectral
+norm of the Jacobian matrix associated with the convolution operation. This
+new upper bound is independent of the input image resolution, differentiable,
+and can be efficiently calculated during training. Through experiments, we
+demonstrate how this new bound can be used to improve the performance of
+convolutional architectures.
+
+
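The bound rests on the spectral norm of the 4D kernel viewed as a tensor. One standard way to approximate a tensor spectral norm is the higher-order power method (alternating rank-1 fitting), sketched below in NumPy; the paper's constant factor and exact computation are not reproduced here.

```python
import numpy as np

def tensor_spectral_norm(K, iters=100, seed=0):
    """Approximate the spectral norm of a 4D tensor K[o, i, h, w], i.e. the
    maximum of <K, u x v x a x b> over unit vectors u, v, a, b, via the
    higher-order power method. A generic sketch only; the paper's upper
    bound scales this quantity by a constant factor not reproduced here."""
    rng = np.random.default_rng(seed)
    u, v, a, b = (rng.standard_normal(s) for s in K.shape)
    for _ in range(iters):
        # Alternately update each factor as the contraction over the others.
        u = np.einsum("oihw,i,h,w->o", K, v, a, b); u /= np.linalg.norm(u)
        v = np.einsum("oihw,o,h,w->i", K, u, a, b); v /= np.linalg.norm(v)
        a = np.einsum("oihw,o,i,w->h", K, u, v, b); a /= np.linalg.norm(a)
        b = np.einsum("oihw,o,i,h->w", K, u, v, a); b /= np.linalg.norm(b)
    return float(np.einsum("oihw,o,i,h,w->", K, u, v, a, b))

K = np.random.randn(16, 3, 3, 3)   # out-channels, in-channels, height, width
print(tensor_spectral_norm(K))
```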
+ comment: ECCV 2024 +
+
+
+
+
+ + ☆ Edge-Based Graph Component Pooling ECML + + +
+ Graph-structured data naturally occurs in many research fields, such as
+chemistry and sociology. The relational information contained therein can be
+leveraged to statistically model graph properties through geometrical deep
+learning. Graph neural networks employ techniques, such as message-passing
+layers, to propagate local features through a graph. However, message-passing
+layers can be computationally expensive when dealing with large and sparse
+graphs. Graph pooling operators offer the possibility of removing or merging
+nodes in such graphs, thus lowering computational costs. However, pooling
+operators that remove nodes cause data loss, and pooling operators that merge
+nodes are often computationally expensive. We propose a pooling operator that
+merges nodes without causing data loss and is also conceptually simple and
+computationally inexpensive. We empirically demonstrate that the proposed
+pooling operator performs statistically significantly better than edge pool
+on four popular benchmark datasets while reducing time complexity and the
+number of trainable parameters by 70.6% on average. Compared to another
+maximally powerful method, the Graph Isomorphism Network, we show that our
+operator outperforms it on two popular benchmark datasets while reducing the
+number of learnable parameters by 60.9% on average.
+
+
+ comment: 15 pages, presented at 21st International Workshop on Mining and
+  Learning with Graphs, AstraZeneca Bio & Healthcare Award paper, ECML PKDD
+  2024 Vilnius
+
+
+
+
+
+
+    ☆ An efficient wavelet-based physics-informed neural network for
+  singularly perturbed problems
+
+
+ Physics-informed neural networks (PINNs) are a class of deep learning
+models that utilize physics as differential equations to address complex
+problems, including ones that may involve limited data availability. However,
+tackling solutions of differential equations with oscillations or singular
+perturbations and shock-like structures becomes challenging for PINNs.
+Considering these challenges, we designed an efficient wavelet-based PINN
+(W-PINN) model to solve singularly perturbed differential equations. Here, we
+represent the solution in wavelet space using a family of smooth, compactly
+supported wavelets. This framework represents the solution of a differential
+equation with significantly fewer degrees of freedom while still capturing,
+identifying, and analyzing the local structure of complex physical phenomena.
+The architecture allows the training process to search for a solution within
+wavelet space, making the process faster and more accurate. The proposed
+model does not rely on automatic differentiation for derivatives involved in
+differential equations and does not require any prior information regarding
+the behavior of the solution, such as the location of abrupt features. Thus,
+through a strategic fusion of wavelets with PINNs, W-PINNs excel at capturing
+localized nonlinear information, making them well-suited for problems showing
+abrupt behavior in certain regions, such as singularly perturbed problems.
+The efficiency and accuracy of the proposed neural network model are
+demonstrated in various test problems, namely highly singularly perturbed
+nonlinear differential equations, the FitzHugh-Nagumo (FHN) model, and
+predator-prey interaction models. The proposed model compares favorably with
+traditional PINNs and the recently developed wavelet-based PINNs, which use
+wavelets as an activation function for solving nonlinear differential
+equations.
+
+
+ comment: 17 pages, 12 figures +
+
+
+
+
+ + ☆ Graph Neural Network-State Predictive Information Bottleneck (GNN-SPIB) + approach for learning molecular thermodynamics and kinetics + + +
+ Molecular dynamics simulations offer detailed insights into atomic motions +but face timescale limitations. Enhanced sampling methods have addressed these +challenges but even with machine learning, they often rely on pre-selected +expert-based features. In this work, we present the Graph Neural Network-State +Predictive Information Bottleneck (GNN-SPIB) framework, which combines graph +neural networks and the State Predictive Information Bottleneck to +automatically learn low-dimensional representations directly from atomic +coordinates. Tested on three benchmark systems, our approach predicts essential +structural, thermodynamic and kinetic information for slow processes, +demonstrating robustness across diverse systems. The method shows promise for +complex systems, enabling effective enhanced sampling without requiring +pre-defined reaction coordinates or input features. + +
+
+
+
+
+ + ☆ RaggeDi: Diffusion-based State Estimation of Disordered Rags, Sheets, + Towels and Blankets + + +
+ Cloth state estimation is an important problem in robotics. It is essential +for the robot to know the accurate state to manipulate cloth and execute tasks +such as robotic dressing, stitching, and covering/uncovering human beings. +However, estimating cloth state accurately remains challenging due to its high +flexibility and self-occlusion. This paper proposes a diffusion model-based +pipeline that formulates the cloth state estimation as an image generation +problem by representing the cloth state as an RGB image that describes the +point-wise translation (translation map) between a pre-defined flattened mesh +and the deformed mesh in a canonical space. Then we train a conditional +diffusion-based image generation model to predict the translation map based on +an observation. Experiments are conducted in both simulation and the real world +to validate the performance of our method. Results indicate that our method +outperforms two recent methods in both accuracy and speed. + +
+
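The translation-map representation can be sketched in a few lines: subtract corresponding vertices, reshape to the mesh grid, and normalize the xyz offsets into RGB channels. The linear [0, 255] normalization is an assumption for illustration; the paper may use a different canonical scaling.

```python
import numpy as np

def translation_map_to_rgb(flat_verts, deformed_verts, grid_hw):
    """Encode per-vertex translations between a flattened template mesh and a
    deformed mesh as an RGB image, as the abstract describes.
    flat_verts, deformed_verts: (H*W, 3) arrays of corresponding vertices."""
    h, w = grid_hw
    t = (deformed_verts - flat_verts).reshape(h, w, 3)   # point-wise translation
    t_min, t_max = t.min(), t.max()
    rgb = (t - t_min) / (t_max - t_min + 1e-8) * 255.0   # xyz -> RGB channels
    return rgb.astype(np.uint8), (t_min, t_max)          # keep range to invert

flat = np.random.rand(32 * 32, 3)
deformed = flat + 0.1 * np.random.randn(32 * 32, 3)
img, value_range = translation_map_to_rgb(flat, deformed, (32, 32))
```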
+
+
+
+ + ☆ Optimizing Job Shop Scheduling in the Furniture Industry: A + Reinforcement Learning Approach Considering Machine Setup, Batch Variability, + and Intralogistics + + +
+ This paper explores the potential application of Deep Reinforcement Learning +in the furniture industry. To offer a broad product portfolio, most furniture +manufacturers are organized as a job shop, which ultimately results in the Job +Shop Scheduling Problem (JSSP). The JSSP is addressed with a focus on extending +traditional models to better represent the complexities of real-world +production environments. Existing approaches frequently fail to consider +critical factors such as machine setup times or varying batch sizes. A concept +for a model is proposed that provides a higher level of information detail to +enhance scheduling accuracy and efficiency. The concept introduces the +integration of DRL for production planning, particularly suited to batch +production industries such as the furniture industry. The model extends +traditional approaches to JSSPs by including job volumes, buffer management, +transportation times, and machine setup times. This enables more precise +forecasting and analysis of production flows and processes, accommodating the +variability and complexity inherent in real-world manufacturing processes. The +RL agent learns to optimize scheduling decisions. It operates within a discrete +action space, making decisions based on detailed observations. A reward +function guides the agent's decision-making process, thereby promoting +efficient scheduling and meeting production deadlines. Two integration +strategies for implementing the RL agent are discussed: episodic planning, +which is suitable for low-automation environments, and continuous planning, +which is ideal for highly automated plants. While episodic planning can be +employed as a standalone solution, the continuous planning approach +necessitates the integration of the agent with ERP and Manufacturing Execution +Systems. This integration enables real-time adjustments to production schedules +based on dynamic changes. + +
+
+ comment: 18 pages, 8 figures
+
+
+
+
+
+ + ☆ Accelerating the Training and Improving the Reliability of + Machine-Learned Interatomic Potentials for Strongly Anharmonic Materials + through Active Learning + + +
+ Molecular dynamics (MD) employing machine-learned interatomic potentials +(MLIPs) serve as an efficient, urgently needed complement to ab initio +molecular dynamics (aiMD). By training these potentials on data generated from +ab initio methods, their averaged predictions can exhibit comparable +performance to ab initio methods at a fraction of the cost. However, +insufficient training sets might lead to an improper description of the +dynamics in strongly anharmonic materials, because critical effects might be +overlooked in relevant cases, or only incorrectly captured, or hallucinated by +the MLIP when they are not actually present. In this work, we show that an +active learning scheme that combines MD with MLIPs (MLIP-MD) and uncertainty +estimates can avoid such problematic predictions. In short, efficient MLIP-MD +is used to explore configuration space quickly, whereby an acquisition function +based on uncertainty estimates and on energetic viability is employed to +maximize the value of the newly generated data and to focus on the most +unfamiliar but reasonably accessible regions of phase space. To verify our +methodology, we screen over 112 materials and identify 10 examples experiencing +the aforementioned problems. Using CuI and AgGaSe$_2$ as archetypes for these +problematic materials, we discuss the physical implications for strongly +anharmonic effects and demonstrate how the developed active learning scheme can +address these issues. + +
+
+ comment: 15 pages, 13 figures +
+
+
+
+
+ + ☆ Constraint Guided AutoEncoders for Joint Optimization of Condition + Indicator Estimation and Anomaly Detection in Machine Condition Monitoring + + +
+ The main goal of machine condition monitoring is, as the name implies, to
+monitor the condition of industrial applications. The objective of this
+monitoring can be split into two main problems: a diagnostic problem, where
+normal data should be distinguished from anomalous data, otherwise called
+Anomaly Detection (AD), or a prognostic problem, where the aim is to predict
+the evolution of a Condition Indicator (CI) that reflects the condition of an
+asset throughout its lifetime. When considering machine condition monitoring,
+it is expected that this CI shows a monotonic behavior, as the condition of a
+machine gradually degrades over time. This work proposes an extension to
+Constraint Guided AutoEncoders (CGAE), which is a robust AD method, that
+enables building a single model that can be used for both AD and CI
+estimation. For the purpose of improved CI estimation, the extension
+incorporates a constraint that enforces the model to have monotonically
+increasing CI predictions over time. Experimental results indicate that the
+proposed algorithm performs similarly to, or slightly better than, CGAE with
+regard to AD, while improving the monotonic behavior of the CI.
+
+
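A common way to encourage monotonically increasing CI predictions is a hinge penalty on negative increments, as in the sketch below. CGAE enforces constraints through guided optimization rather than a plain penalty term, so treat this as an illustration of the monotonicity constraint only.

```python
import torch

def monotonicity_penalty(ci):
    """Penalize decreases in a condition-indicator sequence ci of shape
    (batch, T): hinge on negative increments, zero when the CI is
    non-decreasing over time."""
    increments = ci[:, 1:] - ci[:, :-1]
    return torch.relu(-increments).mean()

ci = torch.randn(4, 50).cumsum(dim=1)   # random-walk toy CI trajectories
print(monotonicity_penalty(ci))          # > 0 whenever the CI ever decreases
```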
+ comment: 32 pages, 7 figures, 4 tables +
+
+
+
+
+ + ☆ The Factuality of Large Language Models in the Legal Domain CIKM 2024 + + +
+ This paper investigates the factuality of large language models (LLMs) as +knowledge bases in the legal domain, in a realistic usage scenario: we allow +for acceptable variations in the answer, and let the model abstain from +answering when uncertain. First, we design a dataset of diverse factual +questions about case law and legislation. We then use the dataset to evaluate +several LLMs under different evaluation methods, including exact, alias, and +fuzzy matching. Our results show that the performance improves significantly +under the alias and fuzzy matching methods. Further, we explore the impact of +abstaining and in-context examples, finding that both strategies enhance +precision. Finally, we demonstrate that additional pre-training on legal +documents, as seen with SaulLM, further improves factual precision from 63% to +81%. + +
+
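The three evaluation modes can be sketched with the standard library alone: exact string equality, membership in an alias set, and a fuzzy similarity ratio. The normalization and the 0.8 threshold below are assumptions, not the paper's exact protocol.

```python
from difflib import SequenceMatcher

def is_correct(answer, gold, aliases=(), fuzzy_threshold=0.8):
    """Exact, alias, and fuzzy matching for factual QA evaluation, in the
    spirit of the evaluation methods described above."""
    norm = lambda s: " ".join(s.lower().split())
    answer, gold = norm(answer), norm(gold)
    if answer == gold:                                    # exact match
        return True
    if any(answer == norm(a) for a in aliases):           # alias match
        return True
    ratio = SequenceMatcher(None, answer, gold).ratio()   # fuzzy match
    return ratio >= fuzzy_threshold

print(is_correct("Court of Justice of the EU",
                 "Court of Justice of the European Union",
                 aliases=("CJEU",)))
```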
+ comment: CIKM 2024, short paper +
+
+
+
+
+ + ☆ Symmetry-Based Structured Matrices for Efficient Approximately + Equivariant Networks + + +
+ There has been much recent interest in designing symmetry-aware neural
+networks (NNs) exhibiting relaxed equivariance. Such NNs aim to interpolate
+between being exactly equivariant and being fully flexible, affording
+consistent performance benefits. In a separate line of work, certain
+structured parameter matrices -- those with displacement structure,
+characterized by low displacement rank (LDR) -- have been used to design
+small-footprint NNs. Displacement structure enables fast function and
+gradient evaluation, but permits accurate approximation via compression
+primarily for classical convolutional neural networks (CNNs). In this work,
+we propose a general framework -- based on a novel construction of
+symmetry-based structured matrices -- to build approximately equivariant NNs
+with significantly reduced parameter counts. Our framework integrates the two
+aforementioned lines of work via the use of so-called Group Matrices (GMs), a
+forgotten precursor to the modern notion of regular representations of finite
+groups. GMs allow the design of structured matrices -- resembling LDR
+matrices -- which generalize the linear operations of a classical CNN from
+cyclic groups to general finite groups and their homogeneous spaces. We show
+that GMs can be employed to extend all the elementary operations of CNNs to
+general discrete groups. Further, the theory of structured matrices based on
+GMs provides a generalization of LDR theory, which is focused on matrices
+with cyclic structure, providing a tool for implementing approximate
+equivariance for discrete groups. We test GM-based architectures on a variety
+of tasks in the presence of relaxed symmetry. We report that our framework
+consistently performs competitively compared to approximately equivariant NNs
+and other structured matrix-based compression frameworks, sometimes with a
+parameter count that is one to two orders of magnitude lower.
+
+
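A group matrix is built from a single parameter vector indexed by group elements, M[g, h] = params[g^-1 h]. For the cyclic group this reduces to a circulant matrix, i.e., exactly the linear operation of circular convolution in a CNN, which is the special case the framework generalizes. A sketch from the classical definition (the identity element is assumed to sit at index 0 of the Cayley table):

```python
import numpy as np

def group_matrix(group_table, params):
    """Build a group matrix M[g, h] = params[g^-1 h] from a group's Cayley
    table (group_table[g][h] = index of g*h, identity at index 0). Weight
    sharing follows the group structure: n^2 entries, only n parameters."""
    n = len(params)
    inv = [row.index(0) for row in group_table]   # g^-1 satisfies g*g^-1 = e
    return np.array([[params[group_table[inv[g]][h]] for h in range(n)]
                     for g in range(n)])

# Cayley table of the cyclic group Z_4: (g*h) = (g + h) mod 4.
z4 = [[(g + h) % 4 for h in range(4)] for g in range(4)]
M = group_matrix(z4, np.array([1.0, 2.0, 3.0, 4.0]))
print(M)   # circulant: each row is a cyclic shift of the parameter vector
```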
+ comment: 20 pages +
+
+
+
+
+ + ☆ Consistent Estimation of a Class of Distances Between Covariance + Matrices + + +
+ This work considers the problem of estimating the distance between two
+covariance matrices directly from the data. Particularly, we are interested
+in the family of distances that can be expressed as sums of traces of
+functions that are separately applied to each covariance matrix. This family
+of distances is particularly useful as it takes into consideration the fact
+that covariance matrices lie in the Riemannian manifold of positive definite
+matrices, thereby including a variety of commonly used metrics, such as the
+Euclidean distance, Jeffreys' divergence, and the log-Euclidean distance.
+Moreover, a statistical analysis of the asymptotic behavior of this class of
+distance estimators has also been conducted. Specifically, we present a
+central limit theorem that establishes the asymptotic Gaussianity of these
+estimators and provides closed-form expressions for the corresponding means
+and variances. Empirical evaluations demonstrate the superiority of our
+proposed consistent estimator over conventional plug-in estimators in
+multivariate analytical contexts. Additionally, the central limit theorem
+derived in this study provides a robust statistical framework to assess the
+accuracy of these estimators.
+
+
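As a concrete member of this distance family, the log-Euclidean distance can be computed in plug-in form as below; the paper's contribution is a consistent estimator computed directly from samples, with a central limit theorem for its fluctuations, which this sketch does not reproduce.

```python
import numpy as np
from scipy.linalg import logm

def log_euclidean_distance(c1, c2):
    """Log-Euclidean distance ||log(C1) - log(C2)||_F between SPD matrices,
    one of the metrics covered by the distance family above. This is the
    conventional plug-in version computed from given covariance matrices."""
    d = (logm(c1) - logm(c2)).real   # matrix logarithm is real for SPD inputs
    return float(np.linalg.norm(d, "fro"))

rng = np.random.default_rng(0)
a = rng.standard_normal((100, 5))
b = 1.5 * rng.standard_normal((100, 5))
print(log_euclidean_distance(np.cov(a.T), np.cov(b.T)))
```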
+
+
+
+ + ☆ NPAT Null-Space Projected Adversarial Training Towards Zero + Deterioration + + +
+ To mitigate the susceptibility of neural networks to adversarial attacks,
+adversarial training has emerged as a prevalent and effective defense
+strategy. Intrinsically, this countermeasure incurs a trade-off, as it
+sacrifices the model's accuracy in processing normal samples. To reconcile
+the trade-off, we pioneer the incorporation of null-space projection into
+adversarial training and propose two innovative Null-space Projection based
+Adversarial Training (NPAT) algorithms tackling sample generation and
+gradient optimization, named Null-space Projected Data Augmentation (NPDA)
+and Null-space Projected Gradient Descent (NPGD), to search for an
+overarching optimal solution, which enhances robustness with almost zero
+deterioration in generalization performance. Adversarial samples and
+perturbations are constrained within the null-space of the decision boundary
+utilizing a closed-form null-space projector, effectively mitigating the
+threat of attacks stemming from unreliable features. Subsequently, we conduct
+experiments on the CIFAR10 and SVHN datasets and reveal that our methodology
+can seamlessly combine with adversarial training methods and obtain
+comparable robustness while keeping generalization close to a high-accuracy
+model.
+
+
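The closed-form null-space projector mentioned above is standard linear algebra: for a constraint matrix A, P = I - A⁺A projects any vector into null(A). How NPAT extracts the decision-boundary matrix from the network is in the paper; the projector itself looks like this:

```python
import numpy as np

def null_space_projector(A):
    """Orthogonal projector onto the null space of A: P = I - pinv(A) @ A,
    so A @ (P @ x) = 0 for any x. NPAT constrains adversarial perturbations
    with such a projector; deriving A from the decision boundary is the
    paper's contribution and is not reproduced here."""
    return np.eye(A.shape[1]) - np.linalg.pinv(A) @ A

A = np.random.randn(3, 10)             # 3 constraints in a 10-d input space
P = null_space_projector(A)
delta = np.random.randn(10)            # raw perturbation
delta_null = P @ delta                 # projected perturbation
print(np.allclose(A @ delta_null, 0))  # True: perturbation lies in null(A)
```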
+
+
+
+ + ☆ HARP: Human-Assisted Regrouping with Permutation Invariant Critic for + Multi-Agent Reinforcement Learning + + +
+ Human-in-the-loop reinforcement learning integrates human expertise to
+accelerate agent learning and provide critical guidance and feedback in
+complex fields. However, many existing approaches focus on single-agent tasks
+and require continuous human involvement during the training process,
+significantly increasing the human workload and limiting scalability. In this
+paper, we propose HARP (Human-Assisted Regrouping with Permutation Invariant
+Critic), a multi-agent reinforcement learning framework designed for
+group-oriented tasks. HARP integrates automatic agent regrouping with
+strategic human assistance during deployment, enabling non-experts to offer
+effective guidance with minimal intervention. During training, agents
+dynamically adjust their groupings to optimize collaborative task completion.
+When deployed, they actively seek human assistance and utilize the
+Permutation Invariant Group Critic to evaluate and refine human-proposed
+groupings, allowing non-expert users to contribute valuable suggestions. In
+multiple collaboration scenarios, our approach is able to leverage limited
+guidance from non-experts and enhance performance. The project can be found
+at https://github.com/huawen-hu/HARP.
+
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ☆ From exponential to finite/fixed-time stability: Applications to + optimization + + +
+ The development of finite/fixed-time stable optimization algorithms typically +involves study of specific problem instances. The lack of a unified framework +hinders understanding of more sophisticated algorithms, e.g., primal-dual +gradient flow dynamics. The purpose of this paper is to address the following +question: Given an exponentially stable optimization algorithm, can it be +modified to obtain a finite/fixed-time stable algorithm? We provide an +affirmative answer, demonstrate how the solution can be computed on a +finite-time interval via a simple scaling of the right-hand-side of the +original dynamics, and certify the desired properties of the modified algorithm +using the Lyapunov function that proves exponential stability of the original +system. Finally, we examine nonsmooth composite optimization problems and +smooth problems with linear constraints to demonstrate the merits of our +approach. + +
+
+ comment: 6 pages; 1 figure +
+
+
+
+
+ + ☆ From Lists to Emojis: How Format Bias Affects Model Alignment + + +
+ In this paper, we study format biases in reinforcement learning from human +feedback (RLHF). We observe that many widely-used preference models, including +human evaluators, GPT-4, and top-ranking models on the RewardBench benchmark, +exhibit strong biases towards specific format patterns, such as lists, links, +bold text, and emojis. Furthermore, large language models (LLMs) can exploit +these biases to achieve higher rankings on popular benchmarks like AlpacaEval +and LMSYS Chatbot Arena. One notable example of this is verbosity bias, where +current preference models favor longer responses that appear more +comprehensive, even when their quality is equal to or lower than shorter, +competing responses. However, format biases beyond verbosity remain largely +underexplored in the literature. In this work, we extend the study of biases in +preference learning beyond the commonly recognized length bias, offering a +comprehensive analysis of a wider range of format biases. Additionally, we show +that with a small amount of biased data (less than 1%), we can inject +significant bias into the reward model. Moreover, these format biases can also +be easily exploited by downstream alignment algorithms, such as best-of-n +sampling and online iterative DPO, as it is usually easier to manipulate the +format than to improve the quality of responses. Our findings emphasize the +need to disentangle format and content both for designing alignment algorithms +and evaluating models. + +
+
+ comment: Work in progress
+
+
+
+
+
+ + ☆ Monomial Matrix Group Equivariant Neural Functional Networks + + +
+ Neural functional networks (NFNs) have recently gained significant
+attention due to their diverse applications, ranging from predicting network
+generalization and network editing to classifying implicit neural
+representations. Previous NFN designs often depend on permutation symmetries
+in neural networks' weights, which traditionally arise from the unordered
+arrangement of neurons in hidden layers. However, these designs do not take
+into account the weight scaling symmetries of $\operatorname{ReLU}$ networks,
+and the weight sign flipping symmetries of $\operatorname{sin}$ or
+$\operatorname{tanh}$ networks. In this paper, we extend the study of the
+group action on the network weights from the group of permutation matrices to
+the group of monomial matrices by incorporating scaling/sign-flipping
+symmetries. Particularly, we encode these scaling/sign-flipping symmetries by
+designing our corresponding equivariant and invariant layers. We name our new
+family of NFNs the Monomial Matrix Group Equivariant Neural Functional
+Networks (Monomial-NFN). Because of the expansion of the symmetries,
+Monomial-NFN has far fewer independent trainable parameters compared to the
+baseline NFNs in the literature, thus enhancing the model's efficiency.
+Moreover, for fully connected and convolutional neural networks, we
+theoretically prove that all groups that leave these networks invariant while
+acting on their weight spaces are some subgroups of the monomial matrix
+group. We provide empirical evidence to demonstrate the advantages of our
+model over existing baselines, achieving competitive performance and
+efficiency.
+
+
+
+
+
+ + ☆ Detecting Underdiagnosed Medical Conditions with Deep Learning-Based + Opportunistic CT Imaging + + +
+ Abdominal computed tomography (CT) scans are frequently performed in clinical +settings. Opportunistic CT involves repurposing routine CT images to extract +diagnostic information and is an emerging tool for detecting underdiagnosed +conditions such as sarcopenia, hepatic steatosis, and ascites. This study +utilizes deep learning methods to promote accurate diagnosis and clinical +documentation. We analyze 2,674 inpatient CT scans to identify discrepancies +between imaging phenotypes (characteristics derived from opportunistic CT +scans) and their corresponding documentation in radiology reports and ICD +coding. Through our analysis, we find that only 0.5%, 3.2%, and 30.7% of scans +diagnosed with sarcopenia, hepatic steatosis, and ascites (respectively) +through either opportunistic imaging or radiology reports were ICD-coded. Our +findings demonstrate opportunistic CT's potential to enhance diagnostic +precision and accuracy of risk adjustment models, offering advancements in +precision medicine. + +
+
+
+
+
+ + ☆ Recurrent Interpolants for Probabilistic Time Series Prediction + + +
+ Sequential models such as recurrent neural networks or transformer-based +models became \textit{de facto} tools for multivariate time series forecasting +in a probabilistic fashion, with applications to a wide range of datasets, such +as finance, biology, medicine, etc. Despite their adeptness in capturing +dependencies, assessing prediction uncertainty, and efficiency in training, +challenges emerge in modeling high-dimensional complex distributions and +cross-feature dependencies. To tackle these issues, recent works delve into +generative modeling by employing diffusion or flow-based models. Notably, the +integration of stochastic differential equations or probability flow +successfully extends these methods to probabilistic time series imputation and +forecasting. However, scalability issues necessitate a computational-friendly +framework for large-scale generative model-based predictions. This work +proposes a novel approach by blending the computational efficiency of recurrent +neural networks with the high-quality probabilistic modeling of the diffusion +model, which addresses challenges and advances generative models' application +in time series forecasting. Our method relies on the foundation of stochastic +interpolants and the extension to a broader conditional generation framework +with additional control features, offering insights for future developments in +this dynamic field. + +
+
+
+
+
+ + ☆ An Enhanced-State Reinforcement Learning Algorithm for Multi-Task Fusion + in Large-Scale Recommender Systems + + +
+ As the last key stage of Recommender Systems (RSs), Multi-Task Fusion (MTF)
+is in charge of combining multiple scores predicted by Multi-Task Learning
+(MTL) into a final score to maximize user satisfaction, which decides the
+ultimate recommendation results. In recent years, to maximize long-term user
+satisfaction within a recommendation session, Reinforcement Learning (RL) is
+widely used for MTF in large-scale RSs. However, limited by their modeling
+pattern, all the current RL-MTF methods can only utilize user features as the
+state to generate actions for each user, and are unable to make use of item
+features and other valuable features, which leads to suboptimal results.
+Addressing this problem is a challenge that requires breaking through the
+current modeling pattern of RL-MTF. To solve this problem, we propose a novel
+method called Enhanced-State RL for MTF in RSs. Unlike the existing methods
+mentioned above, our method first defines user features, item features, and
+other valuable features collectively as the enhanced state; it then proposes
+a novel actor and critic learning process to utilize the enhanced state to
+take much better actions for each user-item pair. To the best of our
+knowledge, this novel modeling pattern is being proposed for the first time
+in the field of RL-MTF. We conduct extensive offline and online experiments
+in a large-scale RS. The results demonstrate that our model outperforms other
+models significantly. Enhanced-State RL has been fully deployed in our RS for
+more than half a year, improving user valid consumption by +3.84% and user
+duration time by +0.58% compared to the baseline.
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2404.17589 +
+
+
+
+
+ + ☆ Hypergraph-based Motion Generation with Multi-modal Interaction + Relational Reasoning + + +
+ The intricate nature of real-world driving environments, characterized by +dynamic and diverse interactions among multiple vehicles and their possible +future states, presents considerable challenges in accurately predicting the +motion states of vehicles and handling the uncertainty inherent in the +predictions. Addressing these challenges requires comprehensive modeling and +reasoning to capture the implicit relations among vehicles and the +corresponding diverse behaviors. This research introduces an integrated +framework for autonomous vehicles (AVs) motion prediction to address these +complexities, utilizing a novel Relational Hypergraph Interaction-informed +Neural mOtion generator (RHINO). RHINO leverages hypergraph-based relational +reasoning by integrating a multi-scale hypergraph neural network to model +group-wise interactions among multiple vehicles and their multi-modal driving +behaviors, thereby enhancing motion prediction accuracy and reliability. +Experimental validation using real-world datasets demonstrates the superior +performance of this framework in improving predictive accuracy and fostering +socially aware automated driving in dynamic traffic scenarios. + +
+
+
+
+
+ + ☆ Few-Shot Class-Incremental Learning with Non-IID Decentralized Data + + +
+ Few-shot class-incremental learning is crucial for developing scalable and
+adaptive intelligent systems, as it enables models to acquire new classes
+with minimal annotated data while safeguarding the previously accumulated
+knowledge. Nonetheless, existing methods deal with continuous data streams in
+a centralized manner, limiting their applicability in scenarios that
+prioritize data privacy and security. To this end, this paper introduces
+federated few-shot class-incremental learning, a decentralized machine
+learning paradigm tailored to progressively learn new classes from scarce
+data distributed across multiple clients. In this learning paradigm, clients
+locally update their models with new classes while preserving data privacy,
+and then transmit the model updates to a central server where they are
+aggregated globally. However, this paradigm faces several issues, such as
+difficulties in few-shot learning, catastrophic forgetting, and data
+heterogeneity. To address these challenges, we present a synthetic
+data-driven framework that leverages replay buffer data to maintain existing
+knowledge and facilitate the acquisition of new knowledge. Within this
+framework, a noise-aware generative replay module is developed to fine-tune
+local models with a balance of new and replay data, while generating
+synthetic data of new classes to further expand the replay buffer for future
+tasks. Furthermore, a class-specific weighted aggregation strategy is
+designed to tackle data heterogeneity by adaptively aggregating
+class-specific parameters based on local models' performance on synthetic
+data. This enables effective global model optimization without direct access
+to client data. Comprehensive experiments across three widely used datasets
+underscore the effectiveness and superiority of the introduced framework.
+
+
+
+
+
+ + ☆ How to Build the Virtual Cell with Artificial Intelligence: Priorities + and Opportunities + + +
+ The cell is arguably the smallest unit of life and is central to +understanding biology. Accurate modeling of cells is important for this +understanding as well as for determining the root causes of disease. Recent +advances in artificial intelligence (AI), combined with the ability to generate +large-scale experimental data, present novel opportunities to model cells. Here +we propose a vision of AI-powered Virtual Cells, where robust representations +of cells and cellular systems under different conditions are directly learned +from growing biological data across measurements and scales. We discuss desired +capabilities of AI Virtual Cells, including generating universal +representations of biological entities across scales, and facilitating +interpretable in silico experiments to predict and understand their behavior +using Virtual Instruments. We further address the challenges, opportunities and +requirements to realize this vision including data needs, evaluation +strategies, and community standards and engagement to ensure biological +accuracy and broad utility. We envision a future where AI Virtual Cells help +identify new drug targets, predict cellular responses to perturbations, as well +as scale hypothesis exploration. With open science collaborations across the +biomedical ecosystem that includes academia, philanthropy, and the biopharma +and AI industries, a comprehensive predictive understanding of cell mechanisms +and interactions is within reach. + +
+
+
+
+
+ + ☆ Enhancing Semi-Supervised Learning via Representative and Diverse Sample + Selection + + +
+ Semi-Supervised Learning (SSL) has become a preferred paradigm in many deep
+learning tasks, which reduces the need for human labor. Previous studies
+primarily focus on effectively utilizing the labeled and unlabeled data to
+improve performance. However, we observe that how to select samples for
+labeling also significantly impacts performance, particularly under extremely
+low-budget settings. The sample selection task in SSL has been under-explored
+for a long time. To fill in this gap, we propose a Representative and Diverse
+Sample Selection approach (RDSS). By adopting a modified Frank-Wolfe
+algorithm to minimize a novel criterion $\alpha$-Maximum Mean Discrepancy
+($\alpha$-MMD), RDSS samples a representative and diverse subset for
+annotation from the unlabeled data. We demonstrate that minimizing
+$\alpha$-MMD enhances the generalization ability of low-budget learning.
+Experimental results show that RDSS consistently improves the performance of
+several popular SSL frameworks and outperforms the state-of-the-art sample
+selection approaches used in Active Learning (AL) and Semi-Supervised Active
+Learning (SSAL), even with constrained annotation budgets.
+
+
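For intuition, the vanilla (biased) MMD² estimator under an RBF kernel is sketched below; RDSS minimizes an α-weighted variant of this quantity between the selected subset and the full unlabeled pool, and the α-MMD definition itself is in the paper.

```python
import numpy as np

def mmd2_rbf(X, Y, gamma=1.0):
    """Biased empirical MMD^2 between samples X and Y under the RBF kernel
    k(x, y) = exp(-gamma * ||x - y||^2). A small value means the two sample
    sets are hard to distinguish, i.e. X is representative of Y."""
    def k(A, B):
        sq = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
        return np.exp(-gamma * sq)
    return k(X, X).mean() + k(Y, Y).mean() - 2 * k(X, Y).mean()

pool = np.random.randn(500, 8)                              # unlabeled data
subset = pool[np.random.choice(500, 50, replace=False)]     # candidate picks
print(mmd2_rbf(subset, pool))   # lower -> more representative subset
```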
+ comment: Under Review +
+
+
+
+
+ + ☆ Art and Science of Quantizing Large-Scale Models: A Comprehensive + Overview + + +
+ This paper provides a comprehensive overview of the principles, challenges, +and methodologies associated with quantizing large-scale neural network models. +As neural networks have evolved towards larger and more complex architectures +to address increasingly sophisticated tasks, the computational and energy costs +have escalated significantly. We explore the necessity and impact of model size +growth, highlighting the performance benefits as well as the computational +challenges and environmental considerations. The core focus is on model +quantization as a fundamental approach to mitigate these challenges by reducing +model size and improving efficiency without substantially compromising +accuracy. We delve into various quantization techniques, including both +post-training quantization (PTQ) and quantization-aware training (QAT), and +analyze several state-of-the-art algorithms such as LLM-QAT, PEQA(L4Q), +ZeroQuant, SmoothQuant, and others. Through comparative analysis, we examine +how these methods address issues like outliers, importance weighting, and +activation quantization, ultimately contributing to more sustainable and +accessible deployment of large-scale models. + +
+
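The simplest instance of the PTQ family surveyed here is symmetric per-tensor int8 quantization, sketched below; methods such as SmoothQuant and ZeroQuant build on this basic idea with per-channel scales, outlier handling, and activation quantization.

```python
import numpy as np

def quantize_int8(w):
    """Symmetric per-tensor post-training quantization to int8: map the
    largest absolute weight to 127 and round everything else to the grid."""
    scale = np.abs(w).max() / 127.0
    q = np.clip(np.round(w / scale), -127, 127).astype(np.int8)
    return q, scale

def dequantize(q, scale):
    return q.astype(np.float32) * scale

w = np.random.randn(4096).astype(np.float32)
q, s = quantize_int8(w)
print("max abs error:", np.abs(dequantize(q, s) - w).max())  # about scale / 2
```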
+
+
+
+ + ☆ Hard-Label Cryptanalytic Extraction of Neural Network Models + + +
+ The machine learning problem of extracting neural network parameters was
+proposed nearly three decades ago. Functionally equivalent extraction is a
+crucial goal for research on this problem. When the adversary has access to
+the raw output of neural networks, various attacks, including those presented
+at CRYPTO 2020 and EUROCRYPT 2024, have successfully achieved this goal.
+However, this goal is not achieved when neural networks operate under a
+hard-label setting where the raw output is inaccessible.
+  In this paper, we propose the first attack that theoretically achieves
+functionally equivalent extraction under the hard-label setting, which
+applies to ReLU neural networks. The effectiveness of our attack is validated
+through practical experiments on a wide range of ReLU neural networks,
+including neural networks trained on two real benchmarking datasets (MNIST,
+CIFAR10) widely used in computer vision. For a neural network consisting of
+$10^5$ parameters, our attack requires only several hours on a single core.
+
+
+ comment: Accepted by Asiacrypt 2024 +
+
+
+
+
+ + ☆ DAF-Net: A Dual-Branch Feature Decomposition Fusion Network with Domain + Adaptive for Infrared and Visible Image Fusion + + +
+ Infrared and visible image fusion aims to combine complementary information
+from both modalities to provide a more comprehensive scene understanding.
+However, due to the significant differences between the two modalities,
+preserving key features during the fusion process remains a challenge. To
+address this issue, we propose a dual-branch feature decomposition fusion
+network (DAF-Net) with domain adaptation, which introduces Multi-Kernel
+Maximum Mean Discrepancy (MK-MMD) into the base encoder and designs a hybrid
+kernel function suitable for infrared and visible image fusion. The base
+encoder, built on the Restormer network, captures global structural
+information, while the detail encoder, based on Invertible Neural Networks
+(INN), focuses on extracting detail texture information. By incorporating
+MK-MMD, the DAF-Net effectively aligns the latent feature spaces of visible
+and infrared images, thereby improving the quality of the fused images.
+Experimental results demonstrate that the proposed method outperforms
+existing techniques across multiple datasets, significantly enhancing both
+visual quality and fusion performance. The related Python code is available
+at https://github.com/xujian000/DAF-Net.
+
+
+ comment: 5 pages, 4 figures
+
+
+
+
+
+ + ☆ Enhancing PM2.5 Data Imputation and Prediction in Air Quality Monitoring + Networks Using a KNN-SINDy Hybrid Model + + +
+ Air pollution, particularly particulate matter (PM2.5), poses significant
+risks to public health and the environment, necessitating accurate prediction
+and continuous monitoring for effective air quality management. However, air
+quality monitoring (AQM) data often suffer from missing records due to
+various technical difficulties. This study explores the application of Sparse
+Identification of Nonlinear Dynamics (SINDy) for imputing missing PM2.5 data
+by prediction, using training data from 2016, and compares its performance
+with the established Soft Impute (SI) and K-Nearest Neighbors (KNN) methods.
+
+
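The KNN baseline mentioned above is available off the shelf in scikit-learn; a minimal sketch on a synthetic PM2.5-like series with lag features follows (the series, the feature construction, and k = 5 are illustrative assumptions, not the study's setup).

```python
import numpy as np
from sklearn.impute import KNNImputer

# Synthetic stand-in for an hourly PM2.5 series with 10% missing records.
rng = np.random.default_rng(0)
pm25 = 35 + 10 * np.sin(np.linspace(0, 20, 1000)) + rng.normal(0, 2, 1000)
X = np.column_stack([pm25, np.roll(pm25, 1), np.roll(pm25, 2)])  # lag features
X[rng.random(X.shape) < 0.1] = np.nan

imputer = KNNImputer(n_neighbors=5)   # fill each gap from the 5 nearest rows
X_filled = imputer.fit_transform(X)
print(np.isnan(X_filled).sum())       # 0 -> all gaps filled
```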
+
+
+
+ + ☆ Multimodal Generalized Category Discovery + + +
+ Generalized Category Discovery (GCD) aims to classify inputs into both known +and novel categories, a task crucial for open-world scientific discoveries. +However, current GCD methods are limited to unimodal data, overlooking the +inherently multimodal nature of most real-world data. In this work, we extend +GCD to a multimodal setting, where inputs from different modalities provide +richer and complementary information. Through theoretical analysis and +empirical validation, we identify that the key challenge in multimodal GCD lies +in effectively aligning heterogeneous information across modalities. To address +this, we propose MM-GCD, a novel framework that aligns both the feature and +output spaces of different modalities using contrastive learning and +distillation techniques. MM-GCD achieves new state-of-the-art performance on +the UPMC-Food101 and N24News datasets, surpassing previous methods by 11.5\% +and 4.7\%, respectively. + +
+
+
+
+
+ + ☆ PieClam: A Universal Graph Autoencoder Based on Overlapping Inclusive + and Exclusive Communities + + +
+ We propose PieClam (Prior Inclusive Exclusive Cluster Affiliation Model): a
+probabilistic graph model for representing any graph as overlapping
+generalized communities. Our method can be interpreted as a graph
+autoencoder: nodes are embedded into a code space by an algorithm that
+maximizes the log-likelihood of the decoded graph, given the input graph.
+PieClam is a community affiliation model that extends well-known methods like
+BigClam in two main manners. First, instead of the decoder being defined via
+pairwise interactions between the nodes in the code space, we also
+incorporate a learned prior on the distribution of nodes in the code space,
+turning our method into a graph generative model. Second, we generalize the
+notion of communities by allowing not only sets of nodes with strong
+connectivity, which we call inclusive communities, but also sets of nodes
+with strong disconnection, which we call exclusive communities. To model both
+types of communities, we propose a new type of decoder based on the Lorentz
+inner product, which we prove to be much more expressive than standard
+decoders based on standard inner products or norm distances. By introducing a
+new graph similarity measure, which we call the log cut distance, we show
+that PieClam is a universal autoencoder, able to uniformly approximately
+reconstruct any graph. Our method is shown to obtain competitive performance
+in graph anomaly detection benchmarks.
+
+
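The Lorentz inner product at the heart of the decoder is the indefinite bilinear form ⟨x, y⟩_L = -x₀y₀ + Σ_{i≥1} xᵢyᵢ; its indefiniteness is what lets one code space express both attraction (inclusive communities) and repulsion (exclusive communities). The BigClam-style edge probability below is an assumed parameterization for illustration, not necessarily PieClam's exact decoder.

```python
import numpy as np

def lorentz_inner(x, y):
    """Lorentz inner product <x, y>_L = -x_0 y_0 + sum_{i>=1} x_i y_i.
    Unlike the standard inner product it can be negative even for
    non-negative embeddings, allowing repulsive (exclusive) interactions."""
    return -x[..., 0] * y[..., 0] + (x[..., 1:] * y[..., 1:]).sum(-1)

def edge_probability(x, y):
    # BigClam-style decoder with the Lorentz form swapped in; clamping at 0
    # keeps the probability well-defined when the form is negative.
    return 1.0 - np.exp(-np.maximum(lorentz_inner(x, y), 0.0))

u, v = np.random.rand(4), np.random.rand(4)
print(edge_probability(u, v))
```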
+
+
+
+ + ♻ ☆ UKAN: Unbound Kolmogorov-Arnold Network Accompanied with Accelerated + Library + + +
+ In this work, we present a GPU-accelerated library for the underlying
+components of Kolmogorov-Arnold Networks (KANs), along with an algorithm to
+eliminate bounded grids in KANs. The GPU-accelerated library reduces the
+computational complexity of Basis Spline (B-spline) evaluation by a factor of
+$\mathcal{O}$(grid size) compared to existing codes, enabling batch
+computation for large-scale learning. To overcome the limitations of
+traditional KANs, we introduce Unbounded KANs (UKANs), which eliminate the
+need for a bounded grid and a fixed number of B-spline coefficients. To do
+so, we replace the KAN parameters (B-spline coefficients) with a coefficient
+generator (CG) model. The inputs to the CG model are designed based on the
+idea of an infinite symmetric grid extending from negative infinity to
+positive infinity. The positional encoding of the grid group, a sequential
+collection of B-spline grid indices, is fed into the CG model, and
+coefficients are consumed by the efficient implementation (matrix
+representations) of B-spline functions to generate outputs. We perform
+several experiments on regression, classification, and generative tasks, with
+promising results. In particular, UKAN does not require data normalization or
+a bounded domain for evaluation. Additionally, our benchmarking results
+indicate the superior memory and computational efficiency of our library
+compared to existing codes.
+
+
+ comment: 10 pages, 7 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Machine Learning Approaches for Diagnostics and Prognostics of + Industrial Systems Using Open Source Data from PHM Data Challenges: A Review + + +
+ In the field of Prognostics and Health Management (PHM), recent years have +witnessed a significant surge in the application of machine learning (ML). +Despite this growth, the field grapples with a lack of unified guidelines and +systematic approaches for effectively implementing these ML techniques and +comprehensive analysis regarding industrial open-source data across varied +scenarios. To address these gaps, this paper provides a comprehensive review of +ML approaches for diagnostics and prognostics of industrial systems using +open-source datasets from PHM Data Challenge Competitions held between 2018 and +2023 by PHM Society and IEEE Reliability Society and summarizes a unified ML +framework. This review systematically categorizes and scrutinizes the problems, +challenges, methodologies, and advancements demonstrated in these competitions, +highlighting the evolving role of both conventional machine learning and deep +learning in tackling complex industrial tasks related to detection, diagnosis, +assessment, and prognosis. Moreover, this paper delves into the common +challenges in PHM data challenge competitions by emphasizing data-related and +model-related issues and evaluating the limitations of these competitions. The +potential solutions to address these challenges are also summarized. Finally, +we identify key themes and potential directions for future research, providing +opportunities and prospects for next-generation ML-PHM development in PHM +domain. + +
+
+ comment: Accepted by the International Journal of Prognostics and Health
+  Management (IJPHM)
+
+
+
+
+
+ + ♻ ☆ Characterizing Dynamical Stability of Stochastic Gradient Descent in + Overparameterized Learning + + +
+ For overparameterized optimization tasks, such as the ones found in modern +machine learning, global minima are generally not unique. In order to +understand generalization in these settings, it is vital to study to which +minimum an optimization algorithm converges. The possibility of having minima +that are unstable under the dynamics imposed by the optimization algorithm +limits the potential minima that the algorithm can find. In this paper, we +characterize the global minima that are dynamically stable/unstable for both +deterministic and stochastic gradient descent (SGD). In particular, we +introduce a characteristic Lyapunov exponent which depends on the local +dynamics around a global minimum and rigorously prove that the sign of this +Lyapunov exponent determines whether SGD can accumulate at the respective +global minimum. + +
+
+
+
+
+ + ♻ ☆ ReflectDiffu:Reflect between Emotion-intent Contagion and Mimicry for + Empathetic Response Generation via a RL-Diffusion Framework + + +
+ Empathetic response generation necessitates the integration of emotional
+and intentional dynamics to foster meaningful interactions. Existing research
+either neglects the intricate interplay between emotion and intent, leading
+to suboptimal controllability of empathy, or resorts to large language models
+(LLMs), which incur significant computational overhead. In this paper, we
+introduce ReflectDiffu, a lightweight and comprehensive framework for
+empathetic response generation. This framework incorporates emotion contagion
+to augment emotional expressiveness and employs an emotion-reasoning mask to
+pinpoint critical emotional elements. Additionally, it integrates intent
+mimicry within reinforcement learning for refinement during diffusion. By
+harnessing an intent twice-reflection mechanism of
+Exploring-Sampling-Correcting, ReflectDiffu adeptly translates emotional
+decision-making into precise intent actions, thereby addressing empathetic
+response misalignments stemming from emotional misrecognition. Through
+reflection, the framework maps emotional states to intents, markedly
+enhancing both response empathy and flexibility. Comprehensive experiments
+reveal that ReflectDiffu outperforms existing models regarding relevance,
+controllability, and informativeness, achieving state-of-the-art results in
+both automatic and human evaluations.
+
+
+
+
+
+ + ♻ ☆ TK-Planes: Tiered K-Planes with High Dimensional Feature Vectors for + Dynamic UAV-based Scenes ICRA2025 + + +
+ In this paper, we present a new approach to bridge the domain gap between
+synthetic and real-world data for unmanned aerial vehicle (UAV)-based
+perception. Our formulation is designed for dynamic scenes, consisting of
+small moving objects or human actions. We propose an extension of K-Planes
+Neural Radiance Field (NeRF), wherein our algorithm stores a set of tiered
+feature vectors. The tiered feature vectors are generated to effectively
+model conceptual information about a scene, along with an image decoder that
+transforms output feature maps into RGB images. Our technique leverages the
+information amongst both static and dynamic objects within a scene and is
+able to capture salient scene attributes of high-altitude videos. We evaluate
+its performance on challenging datasets, including Okutama Action and UG2,
+and observe considerable improvement in accuracy over state-of-the-art neural
+rendering methods.
+
+
+ comment: 8 pages, submitted to ICRA2025 +
+
+
+
+
+ + ♻ ☆ Model-free quantification of completeness, uncertainties, and outliers + in atomistic machine learning using information theory + + +
+ An accurate description of information is relevant for a range of problems in +atomistic machine learning (ML), such as crafting training sets, performing +uncertainty quantification (UQ), or extracting physical insights from large +datasets. However, atomistic ML often relies on unsupervised learning or model +predictions to analyze information contents from simulation or training data. +Here, we introduce a theoretical framework that provides a rigorous, model-free +tool to quantify information contents in atomistic simulations. We demonstrate +that the information entropy of a distribution of atom-centered environments +explains known heuristics in ML potential developments, from training set sizes +to dataset optimality. Using this tool, we propose a model-free UQ method that +reliably predicts epistemic uncertainty and detects out-of-distribution +samples, including rare events in systems such as nucleation. This method +provides a general tool for data-driven atomistic modeling and combines efforts +in ML, simulations, and physical explainability. + +
+
+ comment: v2.0 +
+
+
+
+
+ + ♻ ☆ EHRFL: Federated Learning Framework for Institution-Specific Model + Construction using Electronic Health Records + + +
+ The increasing volume of electronic health records (EHRs) across healthcare
+institutions presents the opportunity to enhance model accuracy and
+robustness in clinical prediction tasks. Federated learning enables training
+on data from multiple institutions while preserving patient privacy and
+complying with regulatory constraints. However, most federated learning
+research focuses on constructing a global model for multiple clients,
+overlooking the practical need for institution-specific models. In this work,
+we introduce EHRFL, a federated learning framework using EHRs designed to
+develop a model tailored to a single healthcare institution. Our framework
+addresses two key challenges: (1) enabling federated learning across
+institutions with heterogeneous EHR systems using text-based EHR modeling,
+and (2) reducing the costs associated with federated learning by selecting
+suitable participating clients using averaged patient embeddings, which
+enables optimizing the number of participants without compromising model
+performance for the institution. Our experimental results on multiple
+open-source EHR datasets demonstrate the effectiveness of EHRFL in addressing
+the two challenges, establishing it as a practical solution for
+institution-specific model development in federated learning.
+
+
+
+
+
+ + ♻ ☆ Adaptive Step Sizes for Preconditioned Stochastic Gradient Descent + + +
+ This paper proposes a novel approach to adaptive step sizes in stochastic +gradient descent (SGD) by utilizing quantities that we have identified as +numerically traceable -- the Lipschitz constant for gradients and a concept of +the local variance in search directions. Our findings yield a nearly +hyperparameter-free algorithm for stochastic optimization, which has provable +convergence properties and exhibits truly problem adaptive behavior on +classical image classification tasks. Our framework is set in a general Hilbert +space and thus enables the potential inclusion of a preconditioner through the +choice of the inner product. + +
+
+
+
+
+ + ♻ ☆ A geometric view on probabilistically robust learning + + +
+ Although deep neural networks have achieved super-human performance on many +classification tasks, they often exhibit a worrying lack of robustness towards +adversarially generated examples. Thus, considerable effort has been invested +into reformulating standard Risk Minimization (RM) into an adversarially robust +framework. Recently, attention has shifted towards approaches which interpolate +between the robustness offered by adversarial training and the higher clean +accuracy and faster training times of RM. In this paper, we take a fresh and +geometric view on one such method -- Probabilistically Robust Learning (PRL). +We propose a mathematical framework for understanding PRL, which allows us to +identify geometric pathologies in its original formulation and to introduce a +family of probabilistic nonlocal perimeter functionals to rectify them. We +prove existence of solutions to the original and modified problems using novel +relaxation methods and also study properties, as well as local limits, of the +introduced perimeters. We also clarify, through a suitable $\Gamma$-convergence +analysis, the way in which the original and modified PRL models interpolate +between risk minimization and adversarial training. + +
+
+
+
+
+ + ♻ ☆ PlaSma: Making Small Language Models Better Procedural Knowledge Models + for (Counterfactual) Planning ICLR 2024 + + +
+ Procedural planning, which entails decomposing a high-level goal into a +sequence of temporally ordered steps, is an important yet intricate task for +machines. It involves integrating common-sense knowledge to reason about +complex and often contextualized situations, e.g. ``scheduling a doctor's +appointment without a phone''. While current approaches show encouraging +results using large language models (LLMs), they are hindered by drawbacks such +as costly API calls and reproducibility issues. In this paper, we advocate +planning using smaller language models. We present PlaSma, a novel two-pronged +approach to endow small language models with procedural knowledge and +(constrained) language planning capabilities. More concretely, we develop +symbolic procedural knowledge distillation to enhance the commonsense knowledge +in small language models and an inference-time algorithm to facilitate more +structured and accurate reasoning. In addition, we introduce a new related +task, Replanning, that requires a revision of a plan to cope with a constrained +situation. In both the planning and replanning settings, we show that +orders-of-magnitude smaller models (770M-11B parameters) can compete and often +surpass their larger teacher models' capabilities. Finally, we showcase +successful application of PlaSma in an embodied environment, VirtualHome. + +
+
+ comment: ICLR 2024 version, 31 pages +
+
+
+
+
+ + ♻ ☆ A New Era in Computational Pathology: A Survey on Foundation and + Vision-Language Models + + +
+ Recent advances in deep learning have completely transformed the domain of
+ computational pathology (CPath). More specifically, it has altered the
+ diagnostic workflow of pathologists by integrating foundation models (FMs)
+ and vision-language models (VLMs) in their assessment and decision-making
+ process. The limitations of existing deep learning approaches in CPath can
+ be overcome by FMs through learning a representation space that can be
+ adapted to a wide variety of downstream tasks without explicit supervision.
+ Deploying VLMs allows pathology reports written in natural language to be
+ used as rich semantic information sources to improve existing models as well
+ as to generate predictions in natural language form. In this survey, a
+ holistic and systematic overview of recent innovations in FMs and VLMs in
+ CPath is presented. Furthermore, the tools, datasets and training schemes
+ for these models are summarized in addition to categorizing them into
+ distinct groups. This extensive survey highlights the current trends in
+ CPath and its possible revolution through the use of FMs and VLMs in the
+ future.
+
+
+ comment: 20 pages, 19 figures and 9 tables +
+
+
+
+
+ + ♻ ☆ Operational Wind Speed Forecasts for Chile's Electric Power Sector Using + a Hybrid ML Model + + +
+ As Chile's electric power sector advances toward a future powered by +renewable energy, accurate forecasting of renewable generation is essential for +managing grid operations. The integration of renewable energy sources is +particularly challenging due to the operational difficulties of managing their +power generation, which is highly variable compared to fossil fuel sources, +delaying the availability of clean energy. To mitigate this, we quantify the +impact of increasing intermittent generation from wind and solar on thermal +power plants in Chile and introduce a hybrid wind speed forecasting methodology +which combines two custom ML models for Chile. The first model is based on +TiDE, an MLP-based ML model for short-term forecasts, and the second is based +on a graph neural network, GraphCast, for medium-term forecasts up to 10 days. +Our hybrid approach outperforms the most accurate operational deterministic +systems by 4-21% for short-term forecasts and 5-23% for medium-term forecasts +and can directly lower the impact of wind generation on thermal ramping, +curtailment, and system-level emissions in Chile. + +
+
+
+
+
+ + ♻ ☆ A Fisher-Rao gradient flow for entropic mean-field min-max games + + +
+ Gradient flows play a substantial role in addressing many machine learning +problems. We examine the convergence in continuous-time of a +\textit{Fisher-Rao} (Mean-Field Birth-Death) gradient flow in the context of +solving convex-concave min-max games with entropy regularization. We propose +appropriate Lyapunov functions to demonstrate convergence with explicit rates +to the unique mixed Nash equilibrium. + +
+
+ comment: 24 pages. arXiv admin note: text overlap with arXiv:2306.03033 +
+
+
+
+
+ + ♻ ☆ Hybrid Top-Down Global Causal Discovery with Local Search for Linear and + Nonlinear Additive Noise Models + + +
+ Learning the unique directed acyclic graph corresponding to an unknown causal +model is a challenging task. Methods based on functional causal models can +identify a unique graph, but either suffer from the curse of dimensionality or +impose strong parametric assumptions. To address these challenges, we propose a +novel hybrid approach for global causal discovery in observational data that +leverages local causal substructures. We first present a topological sorting +algorithm that leverages ancestral relationships in linear structural equation +models to establish a compact top-down hierarchical ordering, encoding more +causal information than linear orderings produced by existing methods. We +demonstrate that this approach generalizes to nonlinear settings with arbitrary +noise. We then introduce a nonparametric constraint-based algorithm that prunes +spurious edges by searching for local conditioning sets, achieving greater +accuracy than current methods. We provide theoretical guarantees for +correctness and worst-case polynomial time complexities, with empirical +validation on synthetic data. + +
+
+
+
+
+ + ♻ ☆ Neural Graph Generator: Feature-Conditioned Graph Generation using + Latent Diffusion Models + + +
+ Graph generation has emerged as a crucial task in machine learning, with +significant challenges in generating graphs that accurately reflect specific +properties. Existing methods often fall short in efficiently addressing this +need as they struggle with the high-dimensional complexity and varied nature of +graph properties. In this paper, we introduce the Neural Graph Generator (NGG), +a novel approach which utilizes conditioned latent diffusion models for graph +generation. NGG demonstrates a remarkable capacity to model complex graph +patterns, offering control over the graph generation process. NGG employs a +variational graph autoencoder for graph compression and a diffusion process in +the latent vector space, guided by vectors summarizing graph statistics. We +demonstrate NGG's versatility across various graph generation tasks, showing +its capability to capture desired graph properties and generalize to unseen +graphs. We also compare our generator to the graph generation capabilities of +different LLMs. This work signifies a shift in graph generation methodologies, +offering a more practical and efficient solution for generating diverse graphs +with specific characteristics. + +
+
+
+
+
+ + ♻ ☆ Shapley-PC: Constraint-based Causal Structure Learning with Shapley + Values + + +
+ Causal Structure Learning (CSL), also referred to as causal discovery,
+ amounts to extracting causal relations among variables in data. CSL enables
+ the estimation of causal effects from observational data alone, avoiding the
+ need to perform real-life experiments. Constraint-based CSL leverages
+ conditional independence tests to perform causal discovery. We propose
+ Shapley-PC, a novel method to improve constraint-based CSL algorithms by
+ using Shapley values over the possible conditioning sets, to decide which
+ variables are responsible for the observed conditional (in)dependences. We
+ prove soundness, completeness and asymptotic consistency of Shapley-PC and
+ run a simulation study showing that our proposed algorithm is superior to
+ existing versions of PC.
+
+
+ comment: 21 pages (with appendix) +
+
+
+
+
+ + ♻ ☆ A Hybrid Transformer and Attention Based Recurrent Neural Network for + Robust and Interpretable Sentiment Analysis of Tweets + + +
+ Sentiment analysis is crucial for understanding public opinion and consumer
+ behavior. Existing models face challenges with linguistic diversity,
+ generalizability, and explainability. We propose TRABSA, a hybrid framework
+ integrating transformer-based architectures, attention mechanisms, and
+ BiLSTM networks to address this. Leveraging RoBERTa trained on 124M tweets,
+ we bridge gaps in sentiment analysis benchmarks, ensuring state-of-the-art
+ accuracy. Augmenting datasets with tweets from 32 countries and US states,
+ we compare six word-embedding techniques and three lexicon-based labeling
+ techniques, selecting the best for optimal sentiment analysis. TRABSA
+ outperforms traditional ML and deep learning models with 94% accuracy and
+ significant precision, recall, and F1-score gains. Evaluation across diverse
+ datasets demonstrates consistent superiority and generalizability. SHAP and
+ LIME analyses enhance interpretability, improving confidence in predictions.
+ Our study facilitates pandemic resource management, aiding resource
+ planning, policy formation, and vaccination tactics.
+
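+ For a sense of the hybrid architecture, a minimal PyTorch sketch
+ (illustrative only: a plain embedding layer stands in for the RoBERTa
+ encoder, and all layer sizes here are made-up assumptions):
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class TRABSALike(nn.Module):
+     """Encoder features -> BiLSTM -> attention pooling -> classifier."""
+     def __init__(self, vocab=30522, d=128, hidden=64, n_classes=3):
+         super().__init__()
+         self.embed = nn.Embedding(vocab, d)  # stand-in for RoBERTa features
+         self.bilstm = nn.LSTM(d, hidden, batch_first=True, bidirectional=True)
+         self.attn = nn.Linear(2 * hidden, 1)  # additive attention scores
+         self.head = nn.Linear(2 * hidden, n_classes)
+
+     def forward(self, token_ids):
+         h, _ = self.bilstm(self.embed(token_ids))  # (B, T, 2H)
+         w = torch.softmax(self.attn(h), dim=1)     # attention weights (B, T, 1)
+         ctx = (w * h).sum(dim=1)                   # attention pooling (B, 2H)
+         return self.head(ctx)
+
+ logits = TRABSALike()(torch.randint(0, 30522, (2, 16)))
+ print(logits.shape)  # torch.Size([2, 3])
+ ```
+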
+
+
+
+
+ + ♻ ☆ Continual Learning: Forget-free Winning Subnetworks for Video + Representations + + +
+ Inspired by the Lottery Ticket Hypothesis (LTH), which highlights the
+ existence of efficient subnetworks within larger, dense networks, we
+ consider a Winning Subnetwork (WSN) that achieves high task performance
+ under appropriate sparsity conditions for various continual learning tasks.
+ It leverages pre-existing weights from dense networks to achieve efficient
+ learning in Task Incremental Learning (TIL) and Task-agnostic Incremental
+ Learning (TaIL) scenarios. In Few-Shot Class Incremental Learning (FSCIL), a
+ variation of WSN referred to as the Soft subnetwork (SoftNet) is designed to
+ prevent overfitting when the data samples are scarce. Furthermore, the
+ sparse reuse of WSN weights is considered for Video Incremental Learning
+ (VIL). We also consider the use of the Fourier Subneural Operator (FSO)
+ within WSN, which enables compact encoding of videos and identifies reusable
+ subnetworks across varying bandwidths. We have integrated FSO into different
+ architectural frameworks for continual learning, including VIL, TIL, and
+ FSCIL. Our comprehensive experiments demonstrate FSO's effectiveness,
+ significantly improving task performance at various convolutional
+ representational levels. Specifically, FSO enhances higher-layer performance
+ in TIL and FSCIL and lower-layer performance in VIL.
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2303.14962, + arXiv:2306.11305 +
+
+
+
+
+ + ♻ ☆ Inverse Problems with Diffusion Models: A MAP Estimation Perspective + + +
+ Inverse problems have many applications in science and engineering. In
+ computer vision, several image restoration tasks such as inpainting,
+ deblurring, and super-resolution can be formally modeled as inverse
+ problems. Recently, methods have been developed for solving inverse problems
+ that only leverage a pre-trained unconditional diffusion model and do not
+ require additional task-specific training. In such methods, however, the
+ inherent intractability of determining the conditional score function during
+ the reverse diffusion process poses a real challenge, leaving the methods to
+ settle for an approximation instead, which affects their performance in
+ practice. Here, we propose a MAP estimation framework to model the reverse
+ conditional generation process of a continuous-time diffusion model as an
+ optimization process of the underlying MAP objective, whose gradient term is
+ tractable. In theory, the proposed framework can be applied to solve general
+ inverse problems using gradient-based optimization methods. However, given
+ the highly non-convex nature of the loss objective, finding a perfect
+ gradient-based optimization algorithm can be quite challenging;
+ nevertheless, our framework offers several potential research directions. We
+ use our proposed formulation to develop empirically effective algorithms for
+ image restoration. We validate our proposed algorithms with extensive
+ experiments over multiple datasets across several restoration tasks.
+
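+ To make the MAP view concrete, a toy sketch for a linear inverse problem
+ (illustrative only; `score` is a hypothetical stand-in for a pre-trained
+ diffusion score network, here just a Gaussian prior):
+
+ ```python
+ import numpy as np
+
+ def map_restoration(y, A, score, n_iters=200, lr=1e-2, lam=0.1):
+     """Gradient ascent on a MAP-style objective:
+     log p(x|y) ~ -0.5 * ||y - A x||^2 + lam * log p(x)."""
+     x = A.T @ y  # crude initialization by back projection
+     for _ in range(n_iters):
+         data_grad = A.T @ (y - A @ x)  # gradient of the data fidelity term
+         x = x + lr * (data_grad + lam * score(x))
+     return x
+
+ # toy usage: a Gaussian prior implies score(x) = -x
+ rng = np.random.default_rng(0)
+ A = rng.standard_normal((20, 50)) / np.sqrt(20)
+ x_true = rng.standard_normal(50)
+ y = A @ x_true + 0.01 * rng.standard_normal(20)
+ x_hat = map_restoration(y, A, score=lambda x: -x)
+ print(np.linalg.norm(x_hat - x_true) / np.linalg.norm(x_true))
+ ```
+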
+
+
+
+
+ + ♻ ☆ Creative Beam Search: LLM-as-a-Judge For Improving Response Generation + + +
+ Large language models are revolutionizing several areas, including artificial +creativity. However, the process of generation in machines profoundly diverges +from that observed in humans. In particular, machine generation is +characterized by a lack of intentionality and an underlying creative process. +We propose a method called Creative Beam Search that uses Diverse Beam Search +and LLM-as-a-Judge to perform response generation and response validation. The +results of a qualitative experiment show how our approach can provide better +output than standard sampling techniques. We also show that the response +validation step is a necessary complement to the response generation step. + +
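+ A schematic sketch of the generate-then-validate loop (illustrative only;
+ `generate_candidates` and `judge` are hypothetical placeholders for Diverse
+ Beam Search and the LLM judge):
+
+ ```python
+ from typing import Callable, List
+
+ def creative_beam_search(prompt: str,
+                          generate_candidates: Callable[[str, int], List[str]],
+                          judge: Callable[[str, str], float],
+                          n_beams: int = 4) -> str:
+     # Stage 1: response generation via a diverse candidate generator.
+     candidates = generate_candidates(prompt, n_beams)
+     # Stage 2: response validation, scoring each candidate with a judge.
+     scored = [(judge(prompt, c), c) for c in candidates]
+     return max(scored)[1]
+
+ # toy usage with stub functions
+ stub_gen = lambda p, n: [f"{p} candidate {i}" for i in range(n)]
+ stub_judge = lambda p, c: float(c.endswith("3"))  # pretend judge
+ print(creative_beam_search("Write a metaphor:", stub_gen, stub_judge))
+ ```
+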
+
+ comment: Presented as a short paper at the 15th International Conference on + Computational Creativity (ICCC'24) +
+
+
+
+
+ + ♻ ☆ Training Foundation Models as Data Compression: On Information, Model + Weights and Copyright Law + + +
+ The training process of foundation models, as for other classes of deep
+ learning systems, is based on minimizing the reconstruction error over a
+ training set. For this reason, they are susceptible to the memorization and
+ subsequent reproduction of training samples. In this paper, we introduce a
+ training-as-compressing perspective, wherein the model's weights embody a
+ compressed representation of the training data. From a copyright standpoint,
+ this point of view implies that the weights could be considered a
+ reproduction or a derivative work of a potentially protected set of works.
+ We investigate the technical and legal challenges that emerge from this
+ framing of the copyright of outputs generated by foundation models,
+ including their implications for practitioners and researchers. We
+ demonstrate that adopting an information-centric approach to the problem
+ presents a promising pathway for tackling these emerging complex legal
+ issues.
+
+
+ comment: Spotlight presentation at GenLaw'24, see + https://www.genlaw.org/2024-icml-papers#training-foundation-models-as-data-compression-on-information-model-weights-and-copyright-law +
+
+
+
+
+ + ♻ ☆ A Single-Loop Deep Actor-Critic Algorithm for Constrained Reinforcement + Learning with Provable Convergence + + +
+ Deep Actor-Critic algorithms, which combine Actor-Critic with deep neural
+ networks (DNNs), have been among the most prevalent reinforcement learning
+ algorithms for decision-making problems in simulated environments. However,
+ the existing deep Actor-Critic algorithms are still not mature enough to
+ solve realistic problems with non-convex stochastic constraints and a high
+ cost of interacting with the environment. In this paper, we propose a
+ single-loop deep Actor-Critic (SLDAC) algorithmic framework for general
+ constrained reinforcement learning (CRL) problems. In the actor step, the
+ constrained stochastic successive convex approximation (CSSCA) method is
+ applied to handle the non-convex stochastic objective and constraints. In
+ the critic step, the critic DNNs are only updated once or a few finite times
+ for each iteration, which simplifies the algorithm to a single-loop
+ framework (the existing works require a sufficient number of updates for the
+ critic step to ensure a good enough convergence of the inner loop for each
+ iteration). Moreover, the variance of the policy gradient estimation is
+ reduced by reusing observations from the old policy. The single-loop design
+ and the observation reuse effectively reduce the agent-environment
+ interaction cost and computational complexity. In spite of the biased policy
+ gradient estimation incurred by the single-loop design and observation
+ reuse, we prove that the SLDAC with a feasible initial point can converge to
+ a Karush-Kuhn-Tucker (KKT) point of the original problem almost surely.
+ Simulations show that the SLDAC algorithm can achieve superior performance
+ with much lower interaction cost.
+
+
+
+
+
+ + ♻ ☆ Language Models and Retrieval Augmented Generation for Automated + Structured Data Extraction from Diagnostic Reports + + +
+ Purpose: To develop and evaluate an automated system for extracting
+ structured clinical information from unstructured radiology and pathology
+ reports using open-weights large language models (LMs) and retrieval
+ augmented generation (RAG), and to assess the effects of model configuration
+ variables on extraction performance. Methods and Materials: The study
+ utilized two datasets: 7,294 radiology reports annotated for Brain Tumor
+ Reporting and Data System (BT-RADS) scores and 2,154 pathology reports
+ annotated for isocitrate dehydrogenase (IDH) mutation status. An automated
+ pipeline was developed to benchmark the performance of various LMs and RAG
+ configurations. The impact of model size, quantization, prompting
+ strategies, output formatting, and inference parameters was systematically
+ evaluated. Results: The best-performing models achieved over 98% accuracy in
+ extracting BT-RADS scores from radiology reports and over 90% for IDH
+ mutation status extraction from pathology reports, with a medically
+ fine-tuned Llama3 as the top model. Larger, newer, and domain fine-tuned
+ models consistently outperformed older and smaller models. Model
+ quantization had minimal impact on performance. Few-shot prompting
+ significantly improved accuracy. RAG improved performance for complex
+ pathology reports but not for shorter radiology reports. Conclusions: Open
+ LMs demonstrate significant potential for automated extraction of structured
+ clinical data from unstructured clinical reports in local,
+ privacy-preserving applications. Careful model selection, prompt
+ engineering, and semi-automated optimization using annotated data are
+ critical for optimal performance. These approaches could be reliable enough
+ for practical use in research workflows, highlighting the potential for
+ human-machine collaboration in healthcare data extraction.
+
+
+
+
+
+ + ♻ ☆ OneEncoder: A Lightweight Framework for Progressive Alignment of + Modalities + + +
+ Cross-modal alignment learning integrates information from different
+ modalities like text, image, audio and video to create unified models. This
+ approach develops shared representations and learns correlations between
+ modalities, enabling applications such as visual question answering and
+ audiovisual content analysis. Current techniques rely on large
+ modality-specific encoders, necessitating fine-tuning or training from
+ scratch on vast aligned datasets (e.g., text-image, text-audio,
+ image-audio). This approach has limitations: (i) it is very expensive due to
+ the need for training large encoders on extensive datasets, (ii) acquiring
+ aligned large paired datasets is challenging, and (iii) adding new
+ modalities requires retraining the entire framework to incorporate these
+ modalities. To address these issues, we propose OneEncoder, a lightweight
+ framework that progressively represents and aligns four modalities (image,
+ text, audio, video). Initially, we train a lightweight Universal Projection
+ module (UP) to align image and text modalities. Then, we freeze the
+ pretrained UP and progressively align future modalities to those already
+ aligned. OneEncoder operates efficiently and cost-effectively, even in
+ scenarios where vast aligned datasets are unavailable, due to its
+ lightweight design. Trained on small paired datasets, it shows strong
+ performance in tasks like classification, querying, and visual question
+ answering, surpassing methods that rely on large datasets and specialized
+ encoders.
+
+
+
+
+
+ + ♻ ☆ RetrievalAttention: Accelerating Long-Context LLM Inference via Vector + Retrieval + + +
+ Transformer-based Large Language Models (LLMs) have become increasingly
+ important. However, due to the quadratic time complexity of attention
+ computation, scaling LLMs to longer contexts incurs extremely slow inference
+ latency and high GPU memory consumption for caching key-value (KV) vectors.
+ This paper proposes RetrievalAttention, a training-free approach to both
+ accelerate attention computation and reduce GPU memory consumption. By
+ leveraging the dynamic sparsity of the attention mechanism,
+ RetrievalAttention proposes to use approximate nearest neighbor search
+ (ANNS) indexes for KV vectors in CPU memory and retrieves the most relevant
+ ones with vector search during generation. Unfortunately, we observe that
+ off-the-shelf ANNS indexes are often ineffective for such retrieval tasks
+ due to the out-of-distribution (OOD) mismatch between query vectors and key
+ vectors in the attention mechanism. RetrievalAttention addresses the OOD
+ challenge by designing an attention-aware vector search algorithm that can
+ adapt to the distribution of query vectors. Our evaluation shows that
+ RetrievalAttention only needs to access 1--3% of the data while maintaining
+ high model accuracy. This leads to a significant reduction in the inference
+ cost of long-context LLMs with a much lower GPU memory footprint. In
+ particular, RetrievalAttention only needs a single NVIDIA RTX4090 (24GB) for
+ serving 128K tokens in LLMs with 8B parameters, and is capable of generating
+ one token in 0.188 seconds.
+
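+ A minimal sketch of the retrieval step (illustrative only; exact top-k
+ search in NumPy stands in for the paper's CPU-side, attention-aware ANNS
+ index):
+
+ ```python
+ import numpy as np
+
+ def retrieval_attention(q, K, V, top_k=32):
+     """Attend over only the top_k most relevant KV pairs for one query."""
+     scores = K @ q / np.sqrt(q.shape[0])            # (N,)
+     idx = np.argpartition(scores, -top_k)[-top_k:]  # retrieved KV subset
+     w = np.exp(scores[idx] - scores[idx].max())
+     w /= w.sum()                                    # softmax over the subset
+     return w @ V[idx]
+
+ rng = np.random.default_rng(0)
+ N, d = 100_000, 64  # long context, small head dimension
+ K, V = rng.standard_normal((N, d)), rng.standard_normal((N, d))
+ q = rng.standard_normal(d)
+ print(retrieval_attention(q, K, V).shape)  # (64,)
+ ```
+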
+
+ comment: 16 pages +
+
+
+
+
+ + ♻ ☆ mLoRA: Fine-Tuning LoRA Adapters via Highly-Efficient Pipeline + Parallelism in Multiple GPUs + + +
+ Transformer-based, pre-trained large language models (LLMs) have demonstrated +outstanding performance across diverse domains, particularly in the emerging +{\em pretrain-then-finetune} paradigm. Low-Rank Adaptation (LoRA), a +parameter-efficient fine-tuning method, is commonly used to adapt a base LLM to +multiple downstream tasks. Further, LLM platforms enable developers to +fine-tune multiple models and develop various domain-specific applications +simultaneously. However, existing model parallelism schemes suffer from high +communication overhead and inefficient GPU utilization when training multiple +LoRA tasks across GPUs and machines. + In this paper, we present mLoRA, a parallelism-efficient fine-tuning system +designed for training multiple LoRA across GPUs and machines. mLoRA introduces +a novel LoRA-aware pipeline parallelism scheme that efficiently pipelines +independent LoRA adapters and their distinct fine-tuning stages across GPUs and +machines, along with a new LoRA-efficient operator to enhance GPU utilization +during pipelined LoRA training. Our extensive evaluation shows that mLoRA can +significantly reduce average fine-tuning task completion time, e.g., by 30\%, +compared to state-of-the-art methods like FSDP. More importantly, mLoRA enables +simultaneous fine-tuning of larger models, e.g., two Llama-2-13B models on four +NVIDIA RTX A6000 48GB GPUs, which is not feasible for FSDP due to high memory +requirements. Hence, mLoRA not only increases fine-tuning efficiency but also +makes it more accessible on cost-effective GPUs. mLoRA has been deployed in +AntGroup's production environment. + +
+
+ comment: 14 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Zero-Shot Conditioning of Score-Based Diffusion Models by Neuro-Symbolic + Constraints + + +
+ Score-based diffusion models have emerged as effective approaches for both
+ conditional and unconditional generation. Still, conditional generation is
+ based either on a specific training of a conditional model or on classifier
+ guidance, which requires training a noise-dependent classifier, even when a
+ classifier for uncorrupted data is given. We propose a method that, given a
+ pre-trained unconditional score-based generative model, samples from the
+ conditional distribution under arbitrary logical constraints, without
+ requiring additional training. Unlike other zero-shot techniques, which
+ rather aim at generating valid conditional samples, our method is designed
+ for approximating the true conditional distribution. Firstly, we show how to
+ manipulate the learned score in order to sample from an un-normalized
+ distribution conditional on a user-defined constraint. Then, we define a
+ flexible and numerically stable neuro-symbolic framework for encoding soft
+ logical constraints. Combining these two ingredients, we obtain a general,
+ but approximate, conditional sampling algorithm. We further develop
+ effective heuristics aimed at improving the approximation. Finally, we show
+ the effectiveness of our approach in approximating conditional distributions
+ for various types of constraints and data: tabular data, images and time
+ series.
+
+
+
+
+
+ + ♻ ☆ Data-driven Modeling of Combined Sewer Systems for Urban Sustainability: + An Empirical Evaluation + + +
+ Climate change poses complex challenges, with extreme weather events becoming +increasingly frequent and difficult to model. Examples include the dynamics of +Combined Sewer Systems (CSS). Overburdened CSS during heavy rainfall will +overflow untreated wastewater into surface water bodies. Classical approaches +to modeling the impact of extreme rainfall events rely on physical simulations, +which are particularly challenging to create for large urban infrastructures. +Deep Learning (DL) models offer a cost-effective alternative for modeling the +complex dynamics of sewer systems. In this study, we present a comprehensive +empirical evaluation of several state-of-the-art DL time series models for +predicting sewer system dynamics in a large urban infrastructure, utilizing +three years of measurement data. We especially investigate the potential of DL +models to maintain predictive precision during network outages by comparing +global models, which have access to all variables within the sewer system, and +local models, which are limited to data from a restricted set of local sensors. +Our findings demonstrate that DL models can accurately predict the dynamics of +sewer system load, even under network outage conditions. These results suggest +that DL models can effectively aid in balancing the load redistribution in CSS, +thereby enhancing the sustainability and resilience of urban infrastructures. + +
+
+ comment: 8 pages, 4 figures, accepted at 47th German Conference on Artificial + Intelligence, Wuerzburg 2024 +
+
+
+
+
+ + ♻ ☆ Rényi Divergence Deep Mutual Learning + + +
+ This paper revisits Deep Mutual Learning (DML), a simple yet effective +computing paradigm. We propose using R\'{e}nyi divergence instead of the KL +divergence, which is more flexible and tunable, to improve vanilla DML. This +modification is able to consistently improve performance over vanilla DML with +limited additional complexity. The convergence properties of the proposed +paradigm are analyzed theoretically, and Stochastic Gradient Descent with a +constant learning rate is shown to converge with $\mathcal{O}(1)$-bias in the +worst case scenario for nonconvex optimization tasks. That is, learning will +reach nearby local optima but continue searching within a bounded scope, which +may help mitigate overfitting. Finally, our extensive empirical results +demonstrate the advantage of combining DML and R\'{e}nyi divergence, leading to +further improvement in model generalization. + +
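+ For reference, a small sketch of the Rényi divergence that replaces KL in
+ the mutual-learning loss (illustrative only; the peer outputs below are made
+ up):
+
+ ```python
+ import numpy as np
+
+ def renyi_divergence(p, q, alpha=0.5, eps=1e-12):
+     """D_alpha(p || q) for discrete distributions; alpha -> 1 recovers
+     the KL divergence used in vanilla DML, and alpha is the extra knob."""
+     p, q = np.clip(p, eps, 1.0), np.clip(q, eps, 1.0)
+     if abs(alpha - 1.0) < 1e-6:  # KL limit
+         return float(np.sum(p * np.log(p / q)))
+     return float(np.log(np.sum(p**alpha * q**(1.0 - alpha))) / (alpha - 1.0))
+
+ # two peer networks' softmax outputs for one sample
+ p = np.array([0.7, 0.2, 0.1])
+ q = np.array([0.5, 0.3, 0.2])
+ for a in (0.5, 1.0, 2.0):
+     print(a, renyi_divergence(p, q, a))
+ ```
+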
+
+
+
+
+ + ♻ ☆ Interpretable classifiers for tabular data via discretization and + feature selection + + +
+ We introduce a method for computing immediately human interpretable yet +accurate classifiers from tabular data. The classifiers obtained are short +Boolean formulas, computed via first discretizing the original data and then +using feature selection coupled with a very fast algorithm for producing the +best possible Boolean classifier for the setting. We demonstrate the approach +via 12 experiments, obtaining results with accuracies comparable to ones +obtained via random forests, XGBoost, and existing results for the same +datasets in the literature. In most cases, the accuracy of our method is in +fact similar to that of the reference methods, even though the main objective +of our study is the immediate interpretability of our classifiers. We also +prove a new result on the probability that the classifier we obtain from +real-life data corresponds to the ideally best classifier with respect to the +background distribution the data comes from. + +
+
+ comment: Preprint of a paper in DAO-XAI 2024 (Data meets Applied Ontologies in + Explainable AI) +
+
+
+
+
+ + ♻ ☆ A Robust Autoencoder Ensemble-Based Approach for Anomaly Detection in + Text + + +
+ Anomaly detection (AD) is a fast-growing and popular domain, with
+ established applications in areas like vision and time series. We observe a
+ rich literature for these applications, but anomaly detection in text is
+ only starting to blossom. Recently, self-supervised methods with
+ self-attention mechanisms have been the most popular choice. While recent
+ works have proposed a working ground for building and benchmarking
+ state-of-the-art approaches, we propose two principal contributions in this
+ paper: contextual anomaly contamination and a novel ensemble-based approach.
+ Our method, Textual Anomaly Contamination (TAC), allows inlier classes to be
+ contaminated with either independent or contextual anomalies. In the
+ literature, it appears that this distinction is not made. For finding
+ contextual anomalies, we propose RoSAE, a Robust Subspace Local Recovery
+ Autoencoder Ensemble. All autoencoders of the ensemble present a different
+ latent representation through local manifold learning. Benchmarks show that
+ our approach outperforms recent works on both independent and contextual
+ anomalies, while being more robust. We also provide a comparison across
+ eight datasets instead of relying only on the Reuters and 20 Newsgroups
+ corpora.
+
+
+
+
+
+ + ♻ ☆ Reconciling Kaplan and Chinchilla Scaling Laws + + +
+ Kaplan et al. [2020] (`Kaplan') and Hoffmann et al. [2022] (`Chinchilla') +studied the scaling behavior of transformers trained on next-token language +prediction. These studies produced different estimates for how the number of +parameters ($N$) and training tokens ($D$) should be set to achieve the lowest +possible loss for a given compute budget ($C$). Kaplan: $N_\text{optimal} +\propto C^{0.73}$, Chinchilla: $N_\text{optimal} \propto C^{0.50}$. This paper +finds that much of this discrepancy can be attributed to Kaplan counting +non-embedding rather than total parameters, combined with their analysis being +performed at small scale. Simulating the Chinchilla study under these +conditions produces biased scaling coefficients close to Kaplan's. Hence, this +paper reaffirms Chinchilla's scaling coefficients, by explaining the primary +cause of Kaplan's original overestimation. As a second contribution, the paper +explains differences in the reported relationships between loss and compute. +These findings lead us to recommend that future scaling studies use total +parameters and compute. + +
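+ A back-of-the-envelope sketch of why the counting choice matters at small
+ scale (the rough 12*L*d^2 accounting is a standard approximation, not the
+ papers' exact counts; vocabulary and context sizes are assumptions):
+
+ ```python
+ def transformer_params(n_layer, d_model, vocab=50257, n_ctx=1024):
+     """Rough GPT-style accounting: ~12*d^2 per layer plus embeddings."""
+     non_embedding = 12 * n_layer * d_model**2
+     embedding = vocab * d_model + n_ctx * d_model
+     return non_embedding, non_embedding + embedding
+
+ # At small scale the embedding share dominates, the regime in which
+ # fitting scaling laws to non-embedding counts biases the exponent.
+ for n_layer, d_model in [(2, 128), (12, 768), (48, 1600)]:
+     ne, total = transformer_params(n_layer, d_model)
+     print(f"L={n_layer:2d} d={d_model:4d}  non-emb={ne/1e6:7.1f}M  "
+           f"total={total/1e6:7.1f}M  emb share={1 - ne/total:.0%}")
+ ```
+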
+
+
+
+
+ + ♻ ☆ Explore-Go: Leveraging Exploration for Generalisation in Deep + Reinforcement Learning + + +
+ One of the remaining challenges in reinforcement learning is to develop
+ agents that can generalise to novel scenarios they might encounter once
+ deployed. This challenge is often framed in a multi-task setting where
+ agents train on a fixed set of tasks and have to generalise to new tasks.
+ Recent work has shown that in this setting increased exploration during
+ training can be leveraged to increase the generalisation performance of the
+ agent. This makes sense when the states encountered during testing can
+ actually be explored during training. In this paper, we provide intuition
+ for why exploration can also benefit generalisation to states that cannot be
+ explicitly encountered during training. Additionally, we propose Explore-Go,
+ a novel method that exploits this intuition by increasing the number of
+ states on which the agent trains. Explore-Go effectively increases the
+ starting state distribution of the agent and as a result can be used in
+ conjunction with most existing on-policy or off-policy reinforcement
+ learning algorithms. We show empirically that our method can increase
+ generalisation performance in an illustrative environment and on the Procgen
+ benchmark.
+
+
+
+
+
+ + ♻ ☆ Retrofitting Temporal Graph Neural Networks with Transformer + + +
+ Temporal graph neural networks (TGNNs) outperform regular GNNs by
+ incorporating time information into graph-based operations. However, TGNNs
+ adopt specialized models (e.g., TGN, TGAT, and APAN) and require tailored
+ training frameworks (e.g., TGL and ETC). In this paper, we propose TF-TGN,
+ which uses a Transformer decoder as the backbone model for TGNNs to leverage
+ the Transformer codebase for efficient training. In particular, the
+ Transformer achieves tremendous success for language modeling, and thus the
+ community has developed high-performance kernels (e.g., flash-attention and
+ memory-efficient attention) and efficient distributed training schemes
+ (e.g., PyTorch FSDP, DeepSpeed, and Megatron-LM). We observe that TGNN
+ training resembles language modeling, i.e., the message aggregation
+ operation between chronologically occurring nodes and their temporal
+ neighbors in TGNNs can be structured as sequence modeling. Besides this
+ similarity, we also incorporate a series of algorithm designs including
+ suffix infilling, temporal graph attention with self-loop, and causal
+ masking self-attention to make TF-TGN work. During training, existing
+ systems are slow in transforming the graph topology and conducting graph
+ sampling. As such, we propose methods to parallelize the CSR format
+ conversion and graph sampling. We also adapt the Transformer codebase to
+ train TF-TGN efficiently with multiple GPUs. We experiment with 9 graphs and
+ compare with 2 state-of-the-art TGNN training frameworks. The results show
+ that TF-TGN can accelerate training by over 2.20x while providing comparable
+ or even superior accuracy to existing SOTA TGNNs. TF-TGN is available at
+ https://github.com/qianghuangwhu/TF-TGN.
+
+
+ comment: conference Under review +
+
+
+
+
+ + ♻ ☆ On the Statistical Complexity of Estimation and Testing under Privacy + Constraints + + +
+ The challenge of producing accurate statistics while respecting the privacy
+ of the individuals in a sample is an important area of research. We study
+ minimax lower bounds for classes of differentially private estimators. In
+ particular, we show how to characterize the power of a statistical test
+ under differential privacy in a plug-and-play fashion by solving an
+ appropriate transport problem. With specific coupling constructions, this
+ observation allows us to derive Le Cam-type and Fano-type inequalities not
+ only for regular definitions of differential privacy but also for those
+ based on Rényi divergence. We then proceed to illustrate our results on
+ three simple, fully worked out examples. In particular, we show that the
+ problem class has a major influence on the provable degradation of utility
+ due to privacy. In certain scenarios, we show that maintaining privacy
+ results in a noticeable reduction in performance only when the level of
+ privacy protection is very high. Conversely, for other problems, even a
+ modest level of privacy protection can lead to a significant decrease in
+ performance. Finally, we demonstrate that the DP-SGLD algorithm, a private
+ convex solver, can be employed for maximum likelihood estimation with a high
+ degree of confidence, as it provides near-optimal results with respect to
+ both the size of the sample and the level of privacy protection. This
+ algorithm is applicable to a broad range of parametric estimation
+ procedures, including exponential families.
+
+
+
+
+
+ + ♻ ☆ Specify What? Enhancing Neural Specification Synthesis by Symbolic + Methods + + +
+ We investigate how combinations of Large Language Models (LLMs) and symbolic
+ analyses can be used to synthesise specifications of C programs. The LLM
+ prompts are augmented with outputs from two formal methods tools in the
+ Frama-C ecosystem, Pathcrawler and EVA, to produce C program annotations in
+ the specification language ACSL. We demonstrate how the addition of symbolic
+ analysis to the workflow impacts the quality of annotations: information
+ about input/output examples from Pathcrawler produces more context-aware
+ annotations, while the inclusion of EVA reports yields annotations more
+ attuned to runtime errors. In addition, we show that the method infers the
+ program's intent rather than its behaviour, by generating specifications for
+ buggy programs and observing the robustness of the results against bugs.
+
+
+
+
+
+ + ♻ ☆ Deep Neural Network Benchmarks for Selective Classification + + +
+ With the increasing deployment of machine learning models in many socially
+ sensitive tasks, there is a growing demand for reliable and trustworthy
+ predictions. One way to accomplish these requirements is to allow a model to
+ abstain from making a prediction when there is a high risk of making an
+ error. This requires adding a selection mechanism to the model, which
+ selects those examples for which the model will provide a prediction. The
+ selective classification framework aims to design a mechanism that balances
+ the fraction of rejected predictions (i.e., the proportion of examples for
+ which the model does not make a prediction) versus the improvement in
+ predictive performance on the selected predictions. Multiple selective
+ classification frameworks exist, most of which rely on deep neural network
+ architectures. However, the empirical evaluation of the existing approaches
+ is still limited to partial comparisons among methods and settings,
+ providing practitioners with little insight into their relative merits. We
+ fill this gap by benchmarking 18 baselines on a diverse set of 44 datasets
+ that includes both image and tabular data, with a mix of binary and
+ multiclass tasks. We evaluate these approaches using several criteria,
+ including selective error rate, empirical coverage, the class distribution
+ of rejected instances, and performance on out-of-distribution instances. The
+ results indicate that there is not a single clear winner among the surveyed
+ baselines, and the best method depends on the users' objectives.
+
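+ As a concrete example of one such selection mechanism, a sketch of the
+ simple softmax-response baseline (illustrative only; the classifier outputs
+ below are random stand-ins):
+
+ ```python
+ import numpy as np
+
+ def selective_metrics(probs, labels, threshold=0.8):
+     """Predict only when the max class probability >= threshold; report
+     empirical coverage and the error rate on the accepted subset."""
+     conf = probs.max(axis=1)
+     accept = conf >= threshold
+     coverage = accept.mean()
+     if not accept.any():
+         return coverage, float("nan")
+     preds = probs.argmax(axis=1)
+     return coverage, (preds[accept] != labels[accept]).mean()
+
+ rng = np.random.default_rng(0)
+ probs = rng.dirichlet(np.ones(5), size=1000)  # fake classifier outputs
+ labels = rng.integers(0, 5, size=1000)
+ for t in (0.0, 0.5, 0.8):
+     print(t, selective_metrics(probs, labels, t))
+ ```
+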
+
+ comment: Published in The Journal of Data-centric Machine Learning Research + (DMLR), Vol 1, (17):1-58 (2024) +
+
+
+
+
+ + ♻ ☆ Improving Conditional Level Generation using Automated Validation in + Match-3 Games + + +
+ Generative models for level generation have shown great potential in game
+ production. However, they often provide limited control over the
+ generation, and the validity of the generated levels is unreliable. Despite
+ this fact, only a few approaches that learn from existing data provide the
+ users with ways of controlling the generation while simultaneously
+ addressing the generation of unsolvable levels. This paper proposes Avalon,
+ a novel method to improve models that learn from existing level designs
+ using difficulty statistics extracted from gameplay. In particular, we use a
+ conditional variational autoencoder to generate layouts for match-3 levels,
+ conditioning the model on pre-collected statistics such as game mechanics
+ like difficulty and relevant visual features like size and symmetry. Our
+ method is general enough that multiple approaches could potentially be used
+ to generate these statistics. We quantitatively evaluate our approach by
+ comparing it to an ablated model without difficulty conditioning.
+ Additionally, we analyze both quantitatively and qualitatively whether the
+ style of the dataset is preserved in the generated levels. Our approach
+ generates more valid levels than the same method without difficulty
+ conditioning.
+
+
+ comment: 10 pages, 5 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Leveraging Large Language Models for Solving Rare MIP Challenges + + +
+ Mixed Integer Programming (MIP) has been extensively applied in areas +requiring mathematical solvers to address complex instances within tight time +constraints. However, as the problem scale increases, the complexity of model +formulation and finding feasible solutions escalates significantly. In +contrast, the model-building cost for end-to-end models, such as large language +models (LLMs), remains largely unaffected by problem scale due to their pattern +recognition capabilities. While LLMs, like GPT-4, without fine-tuning, can +handle some traditional medium-scale MIP problems, they struggle with uncommon +or highly specialized MIP scenarios. Fine-tuning LLMs can yield some feasible +solutions for medium-scale MIP instances, but these models typically fail to +explore diverse solutions when constrained by a low and constant temperature, +limiting their performance. In this paper, we propose and evaluate a +recursively dynamic temperature method integrated with a chain-of-thought +approach. Our findings show that starting with a high temperature and gradually +lowering it leads to better feasible solutions compared to other dynamic +temperature strategies. Additionally, by comparing results generated by the LLM +with those from Gurobi, we demonstrate that the LLM can produce solutions that +complement traditional solvers by accelerating the pruning process and +improving overall efficiency. + +
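+ A sketch of a recursively decaying temperature schedule of this kind (the
+ decay form, constants, and the `sample_solution` placeholder are assumptions
+ for illustration, not the paper's exact method):
+
+ ```python
+ def recursive_temperature(t0=1.2, t_min=0.2, decay=0.85, max_rounds=10):
+     """Start hot to explore diverse candidate solutions, then cool
+     toward more deterministic decoding on later attempts."""
+     t = t0
+     for _ in range(max_rounds):
+         yield max(t, t_min)
+         t *= decay
+
+ # hypothetical usage with an LLM sampler on one MIP instance:
+ # best = min((sample_solution(instance, temperature=t)
+ #             for t in recursive_temperature()), key=objective_value)
+ print([round(t, 3) for t in recursive_temperature()])
+ ```
+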
+
+
+
+
+ + ♻ ☆ Active learning for energy-based antibody optimization and enhanced + screening + + +
+ Accurate prediction and optimization of protein-protein binding affinity is
+ crucial for therapeutic antibody development. Although machine
+ learning-based $\Delta\Delta G$ prediction methods are suitable for
+ large-scale mutant screening, they struggle to predict the effects of
+ multiple mutations for targets without existing binders. Energy
+ function-based methods, though more accurate, are time-consuming and not
+ ideal for large-scale screening. To address this, we propose an active
+ learning workflow that efficiently trains a deep learning model to learn
+ energy functions for specific targets, combining the advantages of both
+ approaches. Our method integrates the RDE-Network deep learning model with
+ Rosetta's energy function-based Flex ddG to efficiently explore mutants. In
+ a case study targeting HER2-binding Trastuzumab mutants, our approach
+ significantly improved the screening performance over random selection and
+ demonstrated the ability to identify mutants with better binding properties
+ without experimental $\Delta\Delta G$ data. This workflow advances
+ computational antibody design by combining machine learning, physics-based
+ computations, and active learning to achieve more efficient antibody
+ development.
+
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ Exploring Fine-tuned Generative Models for Keyphrase Selection: A Case + Study for Russian + + +
+ Keyphrase selection plays a pivotal role within the domain of scholarly +texts, facilitating efficient information retrieval, summarization, and +indexing. In this work, we explored how to apply fine-tuned generative +transformer-based models to the specific task of keyphrase selection within +Russian scientific texts. We experimented with four distinct generative models, +such as ruT5, ruGPT, mT5, and mBART, and evaluated their performance in both +in-domain and cross-domain settings. The experiments were conducted on the +texts of Russian scientific abstracts from four domains: mathematics & computer +science, history, medicine, and linguistics. The use of generative models, +namely mBART, led to gains in in-domain performance (up to 4.9% in BERTScore, +9.0% in ROUGE-1, and 12.2% in F1-score) over three keyphrase extraction +baselines for the Russian language. Although the results for cross-domain usage +were significantly lower, they still demonstrated the capability to surpass +baseline performances in several cases, underscoring the promising potential +for further exploration and refinement in this research field. + +
+
+ comment: DAMDID-2024 +
+
+
+
+
+ + ♻ ☆ Probability Passing for Graph Neural Networks: Graph Structure and + Representations Joint Learning + + +
+ Graph Neural Networks (GNNs) have achieved notable success in the analysis
+ of non-Euclidean data across a wide range of domains. However, their
+ applicability is constrained by the dependence on the observed graph
+ structure. To solve this problem, Latent Graph Inference (LGI) has been
+ proposed to infer a task-specific latent structure by computing similarities
+ or edge probabilities from node features and then applying a GNN to produce
+ predictions. Even so, existing approaches neglect the noise in node
+ features, which affects the generated graph structure and performance. In
+ this work, we introduce a novel method called Probability Passing to refine
+ the generated graph structure by aggregating the edge probabilities of
+ neighboring nodes based on the observed graph. Furthermore, we continue to
+ utilize the LGI framework, inputting the refined graph structure and node
+ features into GNNs to obtain predictions. We name the proposed scheme
+ Probability Passing-based Graph Neural Network (PPGNN). Moreover, an
+ anchor-based technique is employed to reduce complexity and improve
+ efficiency. Experimental results demonstrate the effectiveness of the
+ proposed method.
+
+
+
+
+
+ + ♻ ☆ Mask-Encoded Sparsification: Mitigating Biased Gradients in + Communication-Efficient Split Learning + + +
+ This paper introduces a novel framework designed to achieve a high +compression ratio in Split Learning (SL) scenarios where resource-constrained +devices are involved in large-scale model training. Our investigations +demonstrate that compressing feature maps within SL leads to biased gradients +that can negatively impact the convergence rates and diminish the +generalization capabilities of the resulting models. Our theoretical analysis +provides insights into how compression errors critically hinder SL performance, +which previous methodologies underestimate. To address these challenges, we +employ a narrow bit-width encoded mask to compensate for the sparsification +error without increasing the order of time complexity. Supported by rigorous +theoretical analysis, our framework significantly reduces compression errors +and accelerates the convergence. Extensive experiments also verify that our +method outperforms existing solutions regarding training efficiency and +communication complexity. + +
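+ A minimal sketch of the mask-encoding idea (illustrative only; plain top-k
+ sparsification with a packed 1-bit mask, without the paper's
+ error-compensation analysis):
+
+ ```python
+ import numpy as np
+
+ def mask_encode(x, k):
+     """Top-k sparsify a feature map; return (values, packed bitmask).
+     The mask records kept positions at 1 bit per entry, so no index
+     list needs to be transmitted."""
+     flat = x.ravel()
+     keep = np.zeros(flat.size, dtype=bool)
+     keep[np.argpartition(np.abs(flat), -k)[-k:]] = True
+     return flat[keep], np.packbits(keep)
+
+ def mask_decode(values, packed, shape):
+     keep = np.unpackbits(packed)[: int(np.prod(shape))].astype(bool)
+     out = np.zeros(int(np.prod(shape)), dtype=values.dtype)
+     out[keep] = values
+     return out.reshape(shape)
+
+ x = np.random.default_rng(0).standard_normal((8, 32)).astype(np.float32)
+ vals, mask = mask_encode(x, k=25)
+ x_hat = mask_decode(vals, mask, x.shape)
+ print(vals.nbytes + mask.nbytes, "bytes vs", x.nbytes)  # 132 vs 1024
+ ```
+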
+
+
+
+
+ + ♻ ☆ Memory Gym: Towards Endless Tasks to Benchmark Memory Capabilities of + Agents + + +
+ Memory Gym presents a suite of 2D partially observable environments, namely +Mortar Mayhem, Mystery Path, and Searing Spotlights, designed to benchmark +memory capabilities in decision-making agents. These environments, originally +with finite tasks, are expanded into innovative, endless formats, mirroring the +escalating challenges of cumulative memory games such as ``I packed my bag''. +This progression in task design shifts the focus from merely assessing sample +efficiency to also probing the levels of memory effectiveness in dynamic, +prolonged scenarios. To address the gap in available memory-based Deep +Reinforcement Learning baselines, we introduce an implementation that +integrates Transformer-XL (TrXL) with Proximal Policy Optimization. This +approach utilizes TrXL as a form of episodic memory, employing a sliding window +technique. Our comparative study between the Gated Recurrent Unit (GRU) and +TrXL reveals varied performances across different settings. TrXL, on the finite +environments, demonstrates superior sample efficiency in Mystery Path and +outperforms in Mortar Mayhem. However, GRU is more efficient on Searing +Spotlights. Most notably, in all endless tasks, GRU makes a remarkable +resurgence, consistently outperforming TrXL by significant margins. Website and +Source Code: https://github.com/MarcoMeter/endless-memory-gym/ + +
+
+ comment: 40 pages, 12 figures, 7 tables, under review +
+
+
+
+
+ + ♻ ☆ Tracking-Assisted Object Detection with Event Cameras + + +
+ Event-based object detection has recently garnered attention in the computer +vision community due to the exceptional properties of event cameras, such as +high dynamic range and no motion blur. However, feature asynchronism and +sparsity cause invisible objects due to no relative motion to the camera, +posing a significant challenge in the task. Prior works have studied various +implicit-learned memories to retain as many temporal cues as possible. However, +implicit memories still struggle to preserve long-term features effectively. In +this paper, we consider those invisible objects as pseudo-occluded objects and +aim to detect them by tracking through occlusions. Firstly, we introduce the +visibility attribute of objects and contribute an auto-labeling algorithm to +not only clean the existing event camera dataset but also append additional +visibility labels to it. Secondly, we exploit tracking strategies for +pseudo-occluded objects to maintain their permanence and retain their bounding +boxes, even when features have not been available for a very long time. These +strategies can be treated as an explicit-learned memory guided by the tracking +objective to record the displacements of objects across frames. Lastly, we +propose a spatio-temporal feature aggregation module to enrich the latent +features and a consistency loss to increase the robustness of the overall +pipeline. We conduct comprehensive experiments to verify our method's +effectiveness where still objects are retained, but real occluded objects are +discarded. The results demonstrate that (1) the additional visibility labels +can assist in supervised training, and (2) our method outperforms +state-of-the-art approaches with a significant improvement of 7.9% absolute +mAP. + +
+
+
+
+
+ + ♻ ☆ Model-agnostic clean-label backdoor mitigation in cybersecurity + environments + + +
+ The training phase of machine learning models is a delicate step, especially +in cybersecurity contexts. Recent research has surfaced a series of insidious +training-time attacks that inject backdoors in models designed for security +classification tasks without altering the training labels. With this work, we +propose new techniques that leverage insights in cybersecurity threat models to +effectively mitigate these clean-label poisoning attacks, while preserving the +model utility. By performing density-based clustering on a carefully chosen +feature subspace, and progressively isolating the suspicious clusters through a +novel iterative scoring procedure, our defensive mechanism can mitigate the +attacks without requiring many of the common assumptions in the existing +backdoor defense literature. To show the generality of our proposed mitigation, +we evaluate it on two clean-label model-agnostic attacks on two different +classic cybersecurity data modalities: network flows classification and malware +classification, using gradient boosting and neural network models. + +
+
+ comment: 14 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ On the Statistical Complexity of Sample Amplification + + +
+ The ``sample amplification'' problem formalizes the following question: Given +$n$ i.i.d. samples drawn from an unknown distribution $P$, when is it possible +to produce a larger set of $n+m$ samples which cannot be distinguished from +$n+m$ i.i.d. samples drawn from $P$? In this work, we provide a firm +statistical foundation for this problem by deriving generally applicable +amplification procedures, lower bound techniques and connections to existing +statistical notions. Our techniques apply to a large class of distributions +including the exponential family, and establish a rigorous connection between +sample amplification and distribution learning. + +
+
+ comment: To appear in the Annals of Statistics +
+
+
+
+
+ + ♻ ☆ FedVeca: Federated Vectorized Averaging on Non-IID Data with Adaptive + Bi-directional Global Objective + + +
+ Federated Learning (FL) is a distributed machine learning framework to
+ alleviate data silos, where decentralized clients collaboratively learn a
+ global model without sharing their private data. However, the clients'
+ Non-Independent and Identically Distributed (Non-IID) data negatively affect
+ the trained model, and clients with different numbers of local updates may
+ cause significant gaps between the local gradients in each communication
+ round. In this paper, we propose a Federated Vectorized Averaging (FedVeca)
+ method to address the above problem on Non-IID data. Specifically, we set a
+ novel objective for the global model which is related to the local
+ gradients. The local gradient is defined as a bi-directional vector with
+ step size and direction, where the step size is the number of local updates
+ and the direction is divided into positive and negative according to our
+ definition. In FedVeca, the direction is influenced by the step size, thus
+ we average the bi-directional vectors to reduce the effect of different step
+ sizes. Then, we theoretically analyze the relationship between the step
+ sizes and the global objective, and obtain upper bounds on the step sizes
+ per communication round. Based on the upper bounds, we design an algorithm
+ for the server and the clients to adaptively adjust the step sizes so that
+ the objective approaches the optimum. Finally, we conduct experiments on
+ different datasets, models and scenarios by building a prototype system, and
+ the experimental results demonstrate the effectiveness and efficiency of the
+ FedVeca method.
+
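+ One plausible reading of the bi-directional averaging, sketched below (an
+ assumption for illustration: each client update is normalized by its local
+ step count before averaging, which need not be the authors' exact rule):
+
+ ```python
+ import numpy as np
+
+ def vectorized_average(client_updates, client_steps):
+     """Treat each update as a vector whose magnitude grows with its
+     number of local steps; normalize by steps before averaging so that
+     clients with many local updates do not dominate the direction."""
+     directions = [u / max(s, 1) for u, s in zip(client_updates, client_steps)]
+     return np.mean(directions, axis=0)
+
+ rng = np.random.default_rng(0)
+ true_dir = rng.standard_normal(10)
+ steps = [1, 5, 20]  # clients differ in local update counts (Non-IID)
+ updates = [s * (true_dir + 0.3 * rng.standard_normal(10)) for s in steps]
+ print(np.linalg.norm(vectorized_average(updates, steps) - true_dir))
+ ```
+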
+
+
+
+
+ + ♻ ☆ Mobility-GCN: a human mobility-based graph convolutional network for + tracking and analyzing the spatial dynamics of the synthetic opioid crisis in + the USA, 2013-2020 + + +
+ Synthetic opioids are the most common drugs involved in drug-involved +overdose mortalities in the U.S. The Center for Disease Control and Prevention +reported that in 2018, about 70% of all drug overdose deaths involved opioids +and 67% of all opioid-involved deaths were accounted for by synthetic opioids. +In this study, we investigated the spread of synthetic opioids between 2013 and +2020 in the U.S. We analyzed the relationship between the spatiotemporal +pattern of synthetic opioid-involved deaths and another key opioid, heroin, and +compared patterns of deaths involving these two types of drugs during this +period. Spatial connections and human mobility between counties were +incorporated into a graph convolutional neural network model to represent and +analyze the spread of synthetic opioid-involved deaths in the context of +previous heroin-involved death patterns. + +
+
+
+
+
+ + ♻ ☆ On Fairness of Low-Rank Adaptation of Large Models + + +
+ Low-rank adaptation of large models, particularly LoRA, has gained traction +due to its computational efficiency. This efficiency, contrasted with the +prohibitive costs of full-model fine-tuning, means that practitioners often +turn to LoRA and sometimes without a complete understanding of its +ramifications. In this study, we focus on fairness and ask whether LoRA has an +unexamined impact on utility, calibration, and resistance to membership +inference across different subgroups (e.g., genders, races, religions) compared +to a full-model fine-tuning baseline. We present extensive experiments across +vision and language domains and across classification and generation tasks +using ViT-Base, Swin-v2-Large, Llama-2 7B, and Mistral 7B. Intriguingly, +experiments suggest that while one can isolate cases where LoRA exacerbates +model bias across subgroups, the pattern is inconsistent -- in many cases, LoRA +has equivalent or even improved fairness compared to the base model or its full +fine-tuning baseline. We also examine the complications of evaluating +fine-tuning fairness relating to task design and model token bias, calling for +more careful fairness evaluations in future work. + +
+
+ comment: COLM 2024 camera ready +
+
+
+
+
+ + ♻ ☆ Residual Back Projection With Untrained Neural Networks + + +
+ Background and Objective: The success of neural networks in a number of
+image processing tasks has motivated their application in image
+reconstruction problems in computed tomography (CT). While progress has been
+made in this area, the lack of stability and theoretical guarantees for
+accuracy, together with the scarcity of high-quality training data for
+specific imaging domains, poses challenges for many CT applications. In this
+paper, we present a framework for iterative reconstruction (IR) in CT that
+leverages the hierarchical structure of neural networks, without the need for
+training. Our framework incorporates this structural information as a deep
+image prior (DIP), and uses a novel residual back projection (RBP) connection
+that forms the basis for our iterations.
+ Methods: We propose using an untrained U-net in conjunction with a novel
+residual back projection to minimize an objective function and achieve
+high-accuracy reconstruction. In each iteration, the weights of the untrained
+U-net are optimized, and the output of the U-net in the current iteration is
+used to update the input of the U-net in the next iteration through the
+aforementioned RBP connection.
+ Results: Experimental results demonstrate that the RBP-DIP framework offers
+improvements over other state-of-the-art conventional IR methods, as well as
+pre-trained and untrained models with similar network structures under
+multiple conditions. These improvements are particularly significant in the
+few-view, limited-angle, and low-dose imaging configurations.
+ Conclusions: Applied to both parallel and fan beam X-ray imaging, our
+framework shows significant improvement under multiple conditions.
+Furthermore, the proposed framework requires no training data and can be
+adjusted on demand to adapt to different conditions (e.g., noise level,
+geometry, and imaged object).
+
+&#13;
+
+
+
+
+ + ♻ ☆ Enhancing Changepoint Detection: Penalty Learning through Deep Learning + Techniques + + +
+ Changepoint detection, a technique for identifying significant shifts
+within data sequences, is crucial in various fields such as finance,
+genomics, and medicine. Dynamic programming changepoint detection algorithms,
+which rely on a penalty parameter to regulate the number of changepoints, are
+employed to identify the locations of changepoints within a sequence. To
+estimate this penalty parameter, previous work uses simple models such as
+linear or tree-based models. This study introduces a novel deep learning
+method for predicting penalty parameters, leading to demonstrably improved
+changepoint detection accuracy on large benchmark supervised labeled datasets
+compared to previous methods.
+
+&#13;
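+
+ For context, a minimal sketch of the dynamic-programming (optimal
+partitioning) formulation the paper builds on: each additional changepoint
+must reduce the total L2 segment cost by more than the penalty. The paper's
+learned deep model would supply `penalty`; everything else here is a textbook
+baseline, not the paper's code:
+
+import numpy as np
+
+def penalized_changepoints(x, penalty):
+    n = len(x)
+    csum = np.concatenate([[0.0], np.cumsum(x)])
+    csum2 = np.concatenate([[0.0], np.cumsum(np.square(x))])
+
+    def seg_cost(i, j):  # L2 cost of segment x[i:j] (j exclusive)
+        s, s2, m = csum[j] - csum[i], csum2[j] - csum2[i], j - i
+        return s2 - s * s / m
+
+    best = np.zeros(n + 1)
+    last = np.zeros(n + 1, dtype=int)
+    for j in range(1, n + 1):
+        cands = [best[i] + seg_cost(i, j) + penalty for i in range(j)]
+        last[j] = int(np.argmin(cands))
+        best[j] = cands[last[j]]
+    cps, j = [], n
+    while j > 0:  # backtrack the segment starts
+        j = last[j]
+        if j > 0:
+            cps.append(j)
+    return sorted(cps)
+
+signal = np.concatenate([np.zeros(50), 5 + np.zeros(50)])
+signal = signal + 0.5 * np.random.randn(100)
+print(penalized_changepoints(signal, penalty=10.0))  # expect a cp near 50
+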
+
+ comment: 17 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ A Systematic Review of Aspect-based Sentiment Analysis: Domains, + Methods, and Trends + + +
+ Aspect-based sentiment analysis (ABSA) is a fine-grained type of sentiment
+analysis that identifies aspects and their associated opinions from a given
+text. With the surge of digital opinionated text data, ABSA has gained
+increasing popularity for its ability to mine more detailed and targeted
+insights. Many review papers on ABSA subtasks and solution methodologies
+exist; however, few focus on trends over time or systemic issues relating to
+research application domains, datasets, and solution approaches. To fill the
+gap, this paper presents a systematic literature review (SLR) of ABSA studies
+with a focus on trends and high-level relationships among these fundamental
+components. This review is one of the largest SLRs on ABSA. To our knowledge,
+it is also the first to systematically examine the interrelations among ABSA
+research and data distribution across domains, as well as trends in solution
+paradigms and approaches. Our sample includes 727 primary studies screened
+from 8550 search results without time constraints via an innovative automatic
+filtering process. Our quantitative analysis not only identifies trends in
+nearly two decades of ABSA research development but also unveils a systemic
+lack of dataset and domain diversity as well as domain mismatch that may
+hinder the development of future ABSA research. We discuss these findings and
+their implications and propose suggestions for future research.
+
+&#13;
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ Vista3D: Unravel the 3D Darkside of a Single Image ECCV'2024 + + +
+ We embark on the age-old quest: unveiling the hidden dimensions of objects +from mere glimpses of their visible parts. To address this, we present Vista3D, +a framework that realizes swift and consistent 3D generation within a mere 5 +minutes. At the heart of Vista3D lies a two-phase approach: the coarse phase +and the fine phase. In the coarse phase, we rapidly generate initial geometry +with Gaussian Splatting from a single image. In the fine phase, we extract a +Signed Distance Function (SDF) directly from learned Gaussian Splatting, +optimizing it with a differentiable isosurface representation. Furthermore, it +elevates the quality of generation by using a disentangled representation with +two independent implicit functions to capture both visible and obscured aspects +of objects. Additionally, it harmonizes gradients from 2D diffusion prior with +3D-aware diffusion priors by angular diffusion prior composition. Through +extensive evaluation, we demonstrate that Vista3D effectively sustains a +balance between the consistency and diversity of the generated 3D objects. +Demos and code will be available at https://github.com/florinshen/Vista3D. + +
+
+ comment: ECCV'2024 +
+
+
+
+
+ + ☆ MoRAG -- Multi-Fusion Retrieval Augmented Generation for Human Motion + + +
+ We introduce MoRAG, a novel multi-part fusion based retrieval-augmented +generation strategy for text-based human motion generation. The method enhances +motion diffusion models by leveraging additional knowledge obtained through an +improved motion retrieval process. By effectively prompting large language +models (LLMs), we address spelling errors and rephrasing issues in motion +retrieval. Our approach utilizes a multi-part retrieval strategy to improve the +generalizability of motion retrieval across the language space. We create +diverse samples through the spatial composition of the retrieved motions. +Furthermore, by utilizing low-level, part-specific motion information, we can +construct motion samples for unseen text descriptions. Our experiments +demonstrate that our framework can serve as a plug-and-play module, improving +the performance of motion diffusion models. Code, pretrained models and sample +videos will be made available at: https://motion-rag.github.io/ + +
+
+
+
+
+ + ☆ Efficient Low-Resolution Face Recognition via Bridge Distillation + + +
+ Face recognition in the wild is now advancing towards light-weight models, +fast inference speed and resolution-adapted capability. In this paper, we +propose a bridge distillation approach to turn a complex face model pretrained +on private high-resolution faces into a light-weight one for low-resolution +face recognition. In our approach, such a cross-dataset resolution-adapted +knowledge transfer problem is solved via two-step distillation. In the first +step, we conduct cross-dataset distillation to transfer the prior knowledge +from private high-resolution faces to public high-resolution faces and generate +compact and discriminative features. In the second step, the resolution-adapted +distillation is conducted to further transfer the prior knowledge to synthetic +low-resolution faces via multi-task learning. By learning low-resolution face +representations and mimicking the adapted high-resolution knowledge, a +light-weight student model can be constructed with high efficiency and +promising accuracy in recognizing low-resolution faces. Experimental results +show that the student model performs impressively in recognizing low-resolution +faces with only 0.21M parameters and 0.057MB memory. Meanwhile, its speed +reaches up to 14,705, ~934 and 763 faces per second on GPU, CPU and mobile +phone, respectively. + +
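+
+ A generic sketch of the kind of multi-task distillation objective described
+(feature mimicking plus a recognition loss); the weighting `alpha` and the
+specific loss choices are our assumptions, not the paper's exact
+formulation:
+
+import torch
+import torch.nn.functional as F
+
+def distill_step(student_feat, teacher_feat, logits, labels, alpha=0.5):
+    # Pull student features toward the (adapted) teacher features ...
+    mimic = F.mse_loss(student_feat, teacher_feat.detach())
+    # ... while jointly training the recognition task (multi-task learning).
+    task = F.cross_entropy(logits, labels)
+    return alpha * mimic + (1 - alpha) * task
+
+s = torch.randn(4, 128, requires_grad=True)    # student features
+t = torch.randn(4, 128)                        # teacher features
+logits = torch.randn(4, 10, requires_grad=True)
+labels = torch.randint(0, 10, (4,))
+print(distill_step(s, t, logits, labels))
+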
+
+ comment: This paper is published in IEEE TIP 2020 +
+
+
+
+
+ + ☆ DETECLAP: Enhancing Audio-Visual Representation Learning with Object + Information + + +
+ Current audio-visual representation learning can capture rough object
+categories (e.g., ``animals'' and ``instruments''), but it lacks the ability
+to recognize fine-grained details, such as specific categories like ``dogs''
+and ``flutes'' within animals and instruments. To address this issue, we
+introduce DETECLAP, a method to enhance audio-visual representation learning
+with object information. Our key idea is to introduce an audio-visual label
+prediction loss to the existing Contrastive Audio-Visual Masked AutoEncoder
+to enhance its object awareness. To avoid costly manual annotations, we
+prepare object labels from both audio and visual inputs using
+state-of-the-art language-audio models and object detectors. We evaluate the
+method on audio-visual retrieval and classification using the VGGSound and
+AudioSet20K datasets. Our method achieves improvements in recall@10 of +1.5%
+and +1.2% for audio-to-visual and visual-to-audio retrieval, respectively,
+and an improvement in accuracy of +0.6% for audio-visual classification.
+
+&#13;
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ Performance Evaluation of Associative Watermarking Using Statistical + Neurodynamics + + +
+ We theoretically evaluated the performance of our proposed associative +watermarking method in which the watermark is not embedded directly into the +image. We previously proposed a watermarking method that extends the +zero-watermarking model by applying associative memory models. In this model, +the hetero-associative memory model is introduced to the mapping process +between image features and watermarks, and the auto-associative memory model is +applied to correct watermark errors. We herein show that the associative +watermarking model outperforms the zero-watermarking model through computer +simulations using actual images. In this paper, we describe how we derive the +macroscopic state equation for the associative watermarking model using the +Okada theory. The theoretical results obtained by the fourth-order theory were +in good agreement with those obtained by computer simulations. Furthermore, the +performance of the associative watermarking model was evaluated using the bit +error rate of the watermark, both theoretically and using computer simulations. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Official-NV: An LLM-Generated News Video Dataset for Multimodal Fake + News Detection + + +
+ News media, especially video news media, have penetrated into every aspect
+of daily life, which also brings the risk of fake news. Therefore, multimodal
+fake news detection has recently garnered increased attention. However, the
+existing datasets are composed of user-uploaded videos and contain an excess
+of superfluous data, which introduces noise into the model training process.
+To address this issue, we construct a dataset named Official-NV, comprising
+officially published news videos. The crawled officially published videos are
+augmented through LLM-based generation and manual verification, thereby
+expanding the dataset. Furthermore, the proposed dataset is benchmarked
+against several baselines to demonstrate its effectiveness in multimodal fake
+news detection.
+
+&#13;
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 65 + +
+
+
+ + ☆ Towards Fair RAG: On the Impact of Fair Ranking in Retrieval-Augmented + Generation + + +
+ Many language models now enhance their responses with retrieval capabilities, +leading to the widespread adoption of retrieval-augmented generation (RAG) +systems. However, despite retrieval being a core component of RAG, much of the +research in this area overlooks the extensive body of work on fair ranking, +neglecting the importance of considering all stakeholders involved. This paper +presents the first systematic evaluation of RAG systems integrated with fair +rankings. We focus specifically on measuring the fair exposure of each relevant +item across the rankings utilized by RAG systems (i.e., item-side fairness), +aiming to promote equitable growth for relevant item providers. To gain a deep +understanding of the relationship between item-fairness, ranking quality, and +generation quality in the context of RAG, we analyze nine different RAG systems +that incorporate fair rankings across seven distinct datasets. Our findings +indicate that RAG systems with fair rankings can maintain a high level of +generation quality and, in many cases, even outperform traditional RAG systems, +despite the general trend of a tradeoff between ensuring fairness and +maintaining system-effectiveness. We believe our insights lay the groundwork +for responsible and equitable RAG systems and open new avenues for future +research. We publicly release our codebase and dataset at +https://github.com/kimdanny/Fair-RAG. + +
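+
+ As an illustration of item-side exposure (a hedged sketch; the paper's
+exact fairness measure may differ), one can average a geometric position
+discount for each retrieved item over the rankings a RAG system consumes:
+
+def expected_exposure(rankings, gamma=0.85):
+    # Items ranked higher receive geometrically more exposure.
+    exposure = {}
+    for ranking in rankings:
+        for pos, item in enumerate(ranking):
+            exposure[item] = exposure.get(item, 0.0) + gamma ** pos
+    return {k: v / len(rankings) for k, v in exposure.items()}
+
+# Two stochastic rankings of the same relevant items.
+print(expected_exposure([["a", "b", "c"], ["b", "a", "c"]]))
+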
+
+
+
+
+ + ☆ ProSLM : A Prolog Synergized Language Model for explainable Domain + Specific Knowledge Based Question Answering + + +
+ Neurosymbolic approaches can add robustness to opaque neural systems by
+incorporating explainable symbolic representations. However, previous
+approaches have not used formal logic to contextualize queries to and
+validate outputs of large language models (LLMs). We propose \systemname{}, a
+novel neurosymbolic framework, to improve the robustness and reliability of
+LLMs in question-answering tasks. We provide \systemname{} with a
+domain-specific knowledge base, a logical reasoning system, and an
+integration with an existing LLM. This framework has two capabilities: (1)
+context gathering: generating explainable and relevant context for a given
+query, and (2) validation: confirming and validating the factual accuracy of
+a statement in accordance with a knowledge base (KB). Our work opens a new
+area of neurosymbolic generative AI text validation and user personalization.
+
+&#13;
+
+ comment: Accepted at NeSy 2024 +
+
+
+
+
+ + ☆ HEARTS: A Holistic Framework for Explainable, Sustainable and Robust + Text Stereotype Detection NeurIPS 2024 + + +
+ Stereotypes are generalised assumptions about societal groups, and even +state-of-the-art LLMs using in-context learning struggle to identify them +accurately. Due to the subjective nature of stereotypes, where what constitutes +a stereotype can vary widely depending on cultural, social, and individual +perspectives, robust explainability is crucial. Explainable models ensure that +these nuanced judgments can be understood and validated by human users, +promoting trust and accountability. We address these challenges by introducing +HEARTS (Holistic Framework for Explainable, Sustainable, and Robust Text +Stereotype Detection), a framework that enhances model performance, minimises +carbon footprint, and provides transparent, interpretable explanations. We +establish the Expanded Multi-Grain Stereotype Dataset (EMGSD), comprising +57,201 labeled texts across six groups, including under-represented +demographics like LGBTQ+ and regional stereotypes. Ablation studies confirm +that BERT models fine-tuned on EMGSD outperform those trained on individual +components. We then analyse a fine-tuned, carbon-efficient ALBERT-V2 model +using SHAP to generate token-level importance values, ensuring alignment with +human understanding, and calculate explainability confidence scores by +comparing SHAP and LIME outputs. Finally, HEARTS is applied to assess +stereotypical bias in 12 LLM outputs, revealing a gradual reduction in bias +over time within model families. + +
+
+ comment: Submitted to NeurIPS 2024 SoLaR Workshop +
+
+
+
+
+ + ☆ Preference Tuning with Human Feedback on Language, Speech, and Vision + Tasks: A Survey + + +
+ Preference tuning is a crucial process for aligning deep generative models +with human preferences. This survey offers a thorough overview of recent +advancements in preference tuning and the integration of human feedback. The +paper is organized into three main sections: 1) introduction and preliminaries: +an introduction to reinforcement learning frameworks, preference tuning tasks, +models, and datasets across various modalities: language, speech, and vision, +as well as different policy approaches, 2) in-depth examination of each +preference tuning approach: a detailed analysis of the methods used in +preference tuning, and 3) applications, discussion, and future directions: an +exploration of the applications of preference tuning in downstream tasks, +including evaluation methods for different modalities, and an outlook on future +research directions. Our objective is to present the latest methodologies in +preference tuning and model alignment, enhancing the understanding of this +field for researchers and practitioners. We hope to encourage further +engagement and innovation in this area. + +
+
+ comment: Survey paper +
+
+
+
+
+ + ☆ Small Language Models can Outperform Humans in Short Creative Writing: A + Study Comparing SLMs with Humans and LLMs + + +
+ In this paper, we evaluate the creative fiction writing abilities of a +fine-tuned small language model (SLM), BART Large, and compare its performance +to humans and two large language models (LLMs): GPT-3.5 and GPT-4o. Our +evaluation consists of two experiments: (i) a human evaluation where readers +assess the stories generated by the SLM compared to human-written stories, and +(ii) a qualitative linguistic analysis comparing the textual characteristics of +the stories generated by the different models. In the first experiment, we +asked 68 participants to rate short stories generated by the models and humans +along dimensions such as grammaticality, relevance, creativity, and +attractiveness. BART Large outperformed human writers in most aspects, except +creativity, with an overall score of 2.11 compared to 1.85 for human-written +texts -- a 14% improvement. In the second experiment, the qualitative analysis +revealed that, while GPT-4o exhibited near-perfect internal and external +coherence, it tended to produce more predictable narratives, with only 3% of +its stories seen as novel. In contrast, 15% of BART's stories were considered +novel, indicating a higher degree of creativity despite its smaller model size. +This study provides both quantitative and qualitative insights into how model +size and fine-tuning influence the balance between creativity, fluency, and +coherence in creative writing tasks. + +
+
+
+
+
+ + ☆ Chain-of-Thought Prompting for Speech Translation + + +
+ Large language models (LLMs) have demonstrated remarkable advancements in
+language understanding and generation. Building on the success of text-based
+LLMs, recent research has adapted these models to use speech embeddings for
+prompting, resulting in Speech-LLM models that exhibit strong performance in
+automatic speech recognition (ASR) and automatic speech translation (AST). In
+this work, we propose a novel approach to leverage ASR transcripts as prompts
+for AST in a Speech-LLM built on an encoder-decoder text LLM. The Speech-LLM
+model consists of a speech encoder and an encoder-decoder Megatron-T5
+backbone. By first decoding speech to generate ASR transcripts and
+subsequently using these transcripts along with encoded speech for prompting,
+we guide the speech translation in a two-step process like chain-of-thought
+(CoT) prompting. Low-rank adaptation (LoRA) is used for the T5 LLM for model
+adaptation and shows superior performance to full model fine-tuning.
+Experimental results show that the proposed CoT prompting significantly
+improves AST performance, achieving an average increase of 2.4 BLEU points
+across 6 En->X or X->En AST tasks compared to speech prompting alone.
+Additionally, compared to a related CoT prediction method that predicts a
+concatenated sequence of ASR and AST transcripts, our method performs better
+by an average of 2 BLEU points.
+
+&#13;
+
+
+
+
+ + ☆ Egalitarian Language Representation in Language Models: It All Begins + with Tokenizers + + +
+ Tokenizers act as a bridge between human language and the latent space of +language models, influencing how language is represented in these models. Due +to the immense popularity of English-Centric Large Language Models (LLMs), +efforts are being made to adapt them for other languages. However, we +demonstrate that, from a tokenization standpoint, not all tokenizers offer fair +representation for complex script languages such as Tamil, Sinhala, and Hindi, +primarily due to the choice of pre-tokenization methods. We go further to show +that pre-tokenization plays a more critical role than the tokenization +algorithm itself in achieving an egalitarian representation of these complex +script languages. To address this, we introduce an improvement to the Byte Pair +Encoding (BPE) algorithm by incorporating graphemes, which we term Grapheme +Pair Encoding (GPE). Our experiments show that grapheme-based character +extraction outperforms byte-level tokenizers for complex scripts. We validate +this approach through experiments on Tamil, Sinhala, and Hindi. + +
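+
+ The extraction step GPE builds on can be sketched with the third-party
+`regex` module, which supports the `\X` grapheme-cluster pattern (this shows
+only grapheme extraction, not the full merge algorithm):
+
+import regex  # pip install regex
+
+def graphemes(text):
+    # User-perceived characters: the units GPE would run BPE merges over,
+    # instead of raw bytes or code points.
+    return regex.findall(r"\X", text)
+
+print(graphemes("நன்றி"))  # Tamil: 3 grapheme clusters ...
+print(list("நன்றி"))       # ... spanning 5 code points
+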
+
+ comment: Content - 8 pages, References - 3 pages +
+
+
+
+
+ + ☆ Multi-Document Grounded Multi-Turn Synthetic Dialog Generation + + +
+ We introduce a technique for multi-document grounded multi-turn synthetic
+dialog generation that incorporates three main ideas. First, we control the
+overall dialog flow using taxonomy-driven user queries that are generated
+with Chain-of-Thought (CoT) prompting. Second, we support the generation of
+multi-document grounded dialogs by mimicking real-world use of retrievers to
+update the grounding documents after every user turn in the dialog. Third, we
+apply LLM-as-a-Judge to filter out queries with incorrect answers. Human
+evaluation of the synthetic dialog data suggests that the data is diverse,
+coherent, and includes mostly correct answers. Both human and automatic
+evaluations of answerable queries indicate that models fine-tuned on
+synthetic dialogs consistently outperform those fine-tuned on existing
+human-generated training data across four publicly available multi-turn
+document grounded benchmark test sets.
+
+&#13;
+
+
+
+
+ + ☆ Augment, Drop & Swap: Improving Diversity in LLM Captions for Efficient + Music-Text Representation Learning + + +
+ Audio-text contrastive models have become a powerful approach in music +representation learning. Despite their empirical success, however, little is +known about the influence of key design choices on the quality of music-text +representations learnt through this framework. In this work, we expose these +design choices within the constraints of limited data and computation budgets, +and establish a more solid understanding of their impact grounded in empirical +observations along three axes: the choice of base encoders, the level of +curation in training data, and the use of text augmentation. We find that data +curation is the single most important factor for music-text contrastive +training in resource-constrained scenarios. Motivated by this insight, we +introduce two novel techniques, Augmented View Dropout and TextSwap, which +increase the diversity and descriptiveness of text inputs seen in training. +Through our experiments we demonstrate that these are effective at boosting +performance across different pre-training regimes, model architectures, and +downstream data distributions, without incurring higher computational costs or +requiring additional training data. + +
+
+ comment: To appear in the Proceedings of the 25th International Society for + Music Information Retrieval Conference (ISMIR 2024) +
+
+
+
+
+ + ☆ Enriching Datasets with Demographics through Large Language Models: + What's in a Name? + + +
+ Enriching datasets with demographic information, such as gender, race, and +age from names, is a critical task in fields like healthcare, public policy, +and social sciences. Such demographic insights allow for more precise and +effective engagement with target populations. Despite previous efforts +employing hidden Markov models and recurrent neural networks to predict +demographics from names, significant limitations persist: the lack of +large-scale, well-curated, unbiased, publicly available datasets, and the lack +of an approach robust across datasets. This scarcity has hindered the +development of traditional supervised learning approaches. In this paper, we +demonstrate that the zero-shot capabilities of Large Language Models (LLMs) can +perform as well as, if not better than, bespoke models trained on specialized +data. We apply these LLMs to a variety of datasets, including a real-life, +unlabelled dataset of licensed financial professionals in Hong Kong, and +critically assess the inherent demographic biases in these models. Our work not +only advances the state-of-the-art in demographic enrichment but also opens +avenues for future research in mitigating biases in LLMs. + +
+
+ comment: 8 pages, 7 Tables, 5 Figures +
+
+
+
+
+ + ☆ AraDiCE: Benchmarks for Dialectal and Cultural Capabilities in LLMs + + +
+ Arabic, with its rich diversity of dialects, remains significantly +underrepresented in Large Language Models, particularly in dialectal +variations. We address this gap by introducing seven synthetic datasets in +dialects alongside Modern Standard Arabic (MSA), created using Machine +Translation (MT) combined with human post-editing. We present AraDiCE, a +benchmark for Arabic Dialect and Cultural Evaluation. We evaluate LLMs on +dialect comprehension and generation, focusing specifically on low-resource +Arabic dialects. Additionally, we introduce the first-ever fine-grained +benchmark designed to evaluate cultural awareness across the Gulf, Egypt, and +Levant regions, providing a novel dimension to LLM evaluation. Our findings +demonstrate that while Arabic-specific models like Jais and AceGPT outperform +multilingual models on dialectal tasks, significant challenges persist in +dialect identification, generation, and translation. This work contributes ~45K +post-edited samples, a cultural benchmark, and highlights the importance of +tailored training to improve LLM performance in capturing the nuances of +diverse Arabic dialects and cultural contexts. We will release the dialectal +translation models and benchmarks curated in this study. + +
+
+ comment: Benchmarking, Culturally Informed, Large Language Models, Arabic NLP, + LLMs +
+
+
+
+
+ + ☆ NVLM: Open Frontier-Class Multimodal LLMs + + +
+ We introduce NVLM 1.0, a family of frontier-class multimodal large language +models (LLMs) that achieve state-of-the-art results on vision-language tasks, +rivaling the leading proprietary models (e.g., GPT-4o) and open-access models +(e.g., Llama 3-V 405B and InternVL 2). Remarkably, NVLM 1.0 shows improved +text-only performance over its LLM backbone after multimodal training. In terms +of model design, we perform a comprehensive comparison between decoder-only +multimodal LLMs (e.g., LLaVA) and cross-attention-based models (e.g., +Flamingo). Based on the strengths and weaknesses of both approaches, we propose +a novel architecture that enhances both training efficiency and multimodal +reasoning capabilities. Furthermore, we introduce a 1-D tile-tagging design for +tile-based dynamic high-resolution images, which significantly boosts +performance on multimodal reasoning and OCR-related tasks. Regarding training +data, we meticulously curate and provide detailed information on our multimodal +pretraining and supervised fine-tuning datasets. Our findings indicate that +dataset quality and task diversity are more important than scale, even during +the pretraining phase, across all architectures. Notably, we develop +production-grade multimodality for the NVLM-1.0 models, enabling them to excel +in vision-language tasks while maintaining and even improving text-only +performance compared to their LLM backbones. To achieve this, we craft and +integrate a high-quality text-only dataset into multimodal training, alongside +a substantial amount of multimodal math and reasoning data, leading to enhanced +math and coding capabilities across modalities. To advance research in the +field, we are releasing the model weights and will open-source the code for the +community: https://nvlm-project.github.io/. + +
+
+
+
+
+ + ☆ Says Who? Effective Zero-Shot Annotation of Focalization + + +
+ Focalization, the perspective through which narrative is presented, is +encoded via a wide range of lexico-grammatical features and is subject to +reader interpretation. Moreover, trained readers regularly disagree on +interpretations, suggesting that this problem may be computationally +intractable. In this paper, we provide experiments to test how well +contemporary Large Language Models (LLMs) perform when annotating literary +texts for focalization mode. Despite the challenging nature of the task, LLMs +show comparable performance to trained human annotators in our experiments. We +provide a case study working with the novels of Stephen King to demonstrate the +usefulness of this approach for computational literary studies, illustrating +how focalization can be studied at scale. + +
+
+
+
+
+ + ☆ Diversify and Conquer: Diversity-Centric Data Selection with Iterative + Refinement + + +
+ Finetuning large language models on instruction data is crucial for enhancing +pre-trained knowledge and improving instruction-following capabilities. As +instruction datasets proliferate, selecting optimal data for effective training +becomes increasingly important. This work addresses the question: How can we +determine the optimal subset of data for effective training? While existing +research often emphasizes local criteria like instance quality for subset +selection, we argue that a global approach focused on data diversity is more +critical. Our method employs k-means clustering to ensure the selected subset +effectively represents the full dataset. We propose an iterative refinement +method inspired by active learning techniques to resample instances from +clusters, reassessing each cluster's importance and sampling weight in every +training iteration. This approach reduces the effect of outliers and +automatically filters out clusters containing low-quality data. Through +extensive evaluation across natural language reasoning, general world +knowledge, code and math reasoning tasks, and by fine-tuning models from +various families, we observe consistent improvements, achieving a 7% increase +over random selection and a 3.8% improvement over state-of-the-art sampling +methods. Our work highlights the significance of diversity-first sampling when +finetuning LLMs to enhance performance across a broad array of evaluation +tasks. Our code is available at +https://github.com/for-ai/iterative-data-selection. + +
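+
+ A minimal sketch of the diversity-first selection step (k-means plus even
+per-cluster sampling); the paper additionally reweights clusters across
+training iterations, which this sketch omits:
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+def diverse_subset(embeddings, k, per_cluster, seed=0):
+    # Cluster instruction embeddings, then sample evenly per cluster so the
+    # subset covers the full dataset rather than one dense region.
+    km = KMeans(n_clusters=k, n_init=10, random_state=seed)
+    labels = km.fit_predict(embeddings)
+    rng = np.random.default_rng(seed)
+    picked = []
+    for c in range(k):
+        idx = np.flatnonzero(labels == c)
+        take = min(per_cluster, len(idx))
+        picked.extend(rng.choice(idx, size=take, replace=False))
+    return picked
+
+emb = np.random.randn(1000, 32)
+print(len(diverse_subset(emb, k=10, per_cluster=5)))  # 50 diverse examples
+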
+
+ comment: 21 pages, 6 figures +
+
+
+
+
+ + ☆ CoCA: Regaining Safety-awareness of Multimodal Large Language Models + with Constitutional Calibration + + +
+ The deployment of multimodal large language models (MLLMs) has demonstrated
+remarkable success in engaging in conversations involving visual inputs,
+thanks to the superior power of large language models (LLMs). Those MLLMs are
+typically built upon the LLMs, with an image encoder to process images into
+the token embedding space of the LLMs. However, the integration of visual
+modality has introduced a unique vulnerability: the MLLM becomes susceptible
+to malicious visual inputs and prone to generating sensitive or harmful
+responses, even though the LLM has been trained on textual datasets to align
+with human values. In this paper, we first raise the question: ``Do the MLLMs
+possess safety-awareness against malicious image inputs?". We find that after
+adding a principle that specifies the safety requirement into the input of
+the MLLM, the model's safety awareness is boosted. This phenomenon verifies
+the existence of MLLM's safety-awareness against image inputs; it is merely
+weakened by the modality gap. We then introduce a simple yet effective
+technique termed CoCA, which amplifies the safety-awareness of the MLLM by
+calibrating its output distribution. Our proposed strategy helps the model
+reclaim its original safety awareness without losing its original
+capabilities. We verify the effectiveness of our approach on both multimodal
+safety and understanding benchmarks.
+
+&#13;
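+
+ The calibration idea can be sketched in the style of contrastive decoding
+(a hedged guess at the mechanism; the paper's exact calibration rule may
+differ): amplify the logit shift that the safety principle induces:
+
+import torch
+
+def calibrated_logits(logits_with_principle, logits_plain, alpha=1.0):
+    # Amplify the shift induced by prepending the safety principle.
+    delta = logits_with_principle - logits_plain
+    return logits_with_principle + alpha * delta
+
+lw = torch.tensor([2.0, 0.5, -1.0])  # with the safety principle in the input
+lp = torch.tensor([1.0, 1.0, -1.0])  # without it
+print(torch.softmax(calibrated_logits(lw, lp), dim=-1))
+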
+
+ comment: 10 pages, COLM-2024 +
+
+
+
+
+ + ☆ CORE-Bench: Fostering the Credibility of Published Research Through a + Computational Reproducibility Agent Benchmark + + +
+ AI agents have the potential to aid users on a variety of consequential +tasks, including conducting scientific research. To spur the development of +useful agents, we need benchmarks that are challenging, but more crucially, +directly correspond to real-world tasks of interest. This paper introduces such +a benchmark, designed to measure the accuracy of AI agents in tackling a +crucial yet surprisingly challenging aspect of scientific research: +computational reproducibility. This task, fundamental to the scientific +process, involves reproducing the results of a study using the provided code +and data. We introduce CORE-Bench (Computational Reproducibility Agent +Benchmark), a benchmark consisting of 270 tasks based on 90 scientific papers +across three disciplines (computer science, social science, and medicine). +Tasks in CORE-Bench consist of three difficulty levels and include both +language-only and vision-language tasks. We provide an evaluation system to +measure the accuracy of agents in a fast and parallelizable way, saving days of +evaluation time for each run compared to a sequential implementation. We +evaluated two baseline agents: the general-purpose AutoGPT and a task-specific +agent called CORE-Agent. We tested both variants using two underlying language +models: GPT-4o and GPT-4o-mini. The best agent achieved an accuracy of 21% on +the hardest task, showing the vast scope for improvement in automating routine +scientific tasks. Having agents that can reproduce existing work is a necessary +step towards building agents that can conduct novel research and could verify +and improve the performance of other research agents. We hope that CORE-Bench +can improve the state of reproducibility and spur the development of future +research agents. + +
+
+ comment: Benchmark harness and code available at + http://github.com/siegelz/core-bench +
+
+
+
+
+ + ☆ THaMES: An End-to-End Tool for Hallucination Mitigation and Evaluation + in Large Language Models NeurIPS 2024 + + +
+ Hallucination, the generation of factually incorrect content, is a growing +challenge in Large Language Models (LLMs). Existing detection and mitigation +methods are often isolated and insufficient for domain-specific needs, lacking +a standardized pipeline. This paper introduces THaMES (Tool for Hallucination +Mitigations and EvaluationS), an integrated framework and library addressing +this gap. THaMES offers an end-to-end solution for evaluating and mitigating +hallucinations in LLMs, featuring automated test set generation, multifaceted +benchmarking, and adaptable mitigation strategies. It automates test set +creation from any corpus, ensuring high data quality, diversity, and +cost-efficiency through techniques like batch processing, weighted sampling, +and counterfactual validation. THaMES assesses a model's ability to detect and +reduce hallucinations across various tasks, including text generation and +binary classification, applying optimal mitigation strategies like In-Context +Learning (ICL), Retrieval Augmented Generation (RAG), and Parameter-Efficient +Fine-tuning (PEFT). Evaluations of state-of-the-art LLMs using a knowledge base +of academic papers, political news, and Wikipedia reveal that commercial models +like GPT-4o benefit more from RAG than ICL, while open-weight models like +Llama-3.1-8B-Instruct and Mistral-Nemo gain more from ICL. Additionally, PEFT +significantly enhances the performance of Llama-3.1-8B-Instruct in both +evaluation tasks. + +
+
+ comment: Submitted to NeurIPS 2024 SoLaR (Socially Responsible Language + Modelling Research ) Workshop +
+
+
+
+
+ + ☆ SpMis: An Investigation of Synthetic Spoken Misinformation Detection + + +
+ In recent years, speech generation technology has advanced rapidly, fueled by +generative models and large-scale training techniques. While these developments +have enabled the production of high-quality synthetic speech, they have also +raised concerns about the misuse of this technology, particularly for +generating synthetic misinformation. Current research primarily focuses on +distinguishing machine-generated speech from human-produced speech, but the +more urgent challenge is detecting misinformation within spoken content. This +task requires a thorough analysis of factors such as speaker identity, topic, +and synthesis. To address this need, we conduct an initial investigation into +synthetic spoken misinformation detection by introducing an open-source +dataset, SpMis. SpMis includes speech synthesized from over 1,000 speakers +across five common topics, utilizing state-of-the-art text-to-speech systems. +Although our results show promising detection capabilities, they also reveal +substantial challenges for practical implementation, underscoring the +importance of ongoing research in this critical area. + +
+
+ comment: Accepted in SLT 2024 +
+
+
+
+
+ + ☆ EIA: Environmental Injection Attack on Generalist Web Agents for Privacy + Leakage + + +
+ Generalist web agents have evolved rapidly and demonstrated remarkable
+potential. However, there are unprecedented safety risks associated with
+them, which are nearly unexplored so far. In this work, we aim to narrow this
+gap by conducting the first study on the privacy risks of generalist web
+agents in adversarial environments. First, we present a threat model that
+discusses the adversarial targets, constraints, and attack scenarios.
+Particularly, we consider two types of adversarial targets: stealing users'
+specific personally identifiable information (PII) or stealing the entire
+user request. To achieve these objectives, we propose a novel attack method,
+termed Environmental Injection Attack (EIA). This attack injects malicious
+content designed to adapt well to different environments where the agents
+operate, causing them to perform unintended actions. This work instantiates
+EIA specifically for the privacy scenario. It inserts malicious web elements
+alongside persuasive instructions that mislead web agents into leaking
+private information, and can further leverage CSS and JavaScript features to
+remain stealthy. We collect 177 action steps that involve diverse PII
+categories on realistic websites from the Mind2Web dataset, and conduct
+extensive experiments using one of the most capable generalist web agent
+frameworks to date, SeeAct. The results demonstrate that EIA achieves up to
+70% ASR in stealing users' specific PII. Stealing full user requests is more
+challenging, but a relaxed version of EIA can still achieve 16% ASR. Despite
+these concerning results, it is important to note that the attack can still
+be detectable through careful human inspection, highlighting a trade-off
+between high autonomy and security. This leads to our detailed discussion on
+the efficacy of EIA under different levels of human supervision as well as
+implications on defenses for generalist web agents.
+
+&#13;
+
+ comment: 24 pages +
+
+
+
+
+ + ☆ Leveraging Distillation Techniques for Document Understanding: A Case + Study with FLAN-T5 + + +
+ The surge of digital documents in various formats, including less +standardized documents such as business reports and environmental assessments, +underscores the growing importance of Document Understanding. While Large +Language Models (LLMs) have showcased prowess across diverse natural language +processing tasks, their direct application to Document Understanding remains a +challenge. Previous research has demonstrated the utility of LLMs in this +domain, yet their significant computational demands make them challenging to +deploy effectively. Additionally, proprietary Blackbox LLMs often outperform +their open-source counterparts, posing a barrier to widespread accessibility. +In this paper, we delve into the realm of document understanding, leveraging +distillation methods to harness the power of large LLMs while accommodating +computational limitations. Specifically, we present a novel approach wherein we +distill document understanding knowledge from the proprietary LLM ChatGPT into +FLAN-T5. Our methodology integrates labeling and curriculum-learning mechanisms +to facilitate efficient knowledge transfer. This work contributes to the +advancement of document understanding methodologies by offering a scalable +solution that bridges the gap between resource-intensive LLMs and practical +applications. Our findings underscore the potential of distillation techniques +in facilitating the deployment of sophisticated language models in real-world +scenarios, thereby fostering advancements in natural language processing and +document comprehension domains. + +
+
+ comment: Presented at AI@WORK-Workshop / Informatik-Festival (GI-Jahrestagung) + (Wiesbaden, Germany, 2024) +
+
+
+
+
+ + ☆ P-RAG: Progressive Retrieval Augmented Generation For Planning on + Embodied Everyday Task + + +
+ Embodied Everyday Task is a popular task in the embodied AI community, +requiring agents to make a sequence of actions based on natural language +instructions and visual observations. Traditional learning-based approaches +face two challenges. Firstly, natural language instructions often lack explicit +task planning. Secondly, extensive training is required to equip models with +knowledge of the task environment. Previous works based on Large Language Model +(LLM) either suffer from poor performance due to the lack of task-specific +knowledge or rely on ground truth as few-shot samples. To address the above +limitations, we propose a novel approach called Progressive Retrieval Augmented +Generation (P-RAG), which not only effectively leverages the powerful language +processing capabilities of LLMs but also progressively accumulates +task-specific knowledge without ground-truth. Compared to the conventional RAG +methods, which retrieve relevant information from the database in a one-shot +manner to assist generation, P-RAG introduces an iterative approach to +progressively update the database. In each iteration, P-RAG retrieves the +latest database and obtains historical information from the previous +interaction as experiential references for the current interaction. Moreover, +we also introduce a more granular retrieval scheme that not only retrieves +similar tasks but also incorporates retrieval of similar situations to provide +more valuable reference experiences. Extensive experiments reveal that P-RAG +achieves competitive results without utilizing ground truth and can even +further improve performance through self-iterations. + +
+
+
+
+
+ + ☆ Task Arithmetic for Language Expansion in Speech Translation + + +
+ Recent advances in large language models (LLMs) have spurred interest in
+speech-text multimodal foundation models, achieving strong performance on
+instruction-based speech translation (ST). However, expanding language pairs
+from an existing instruction-tuned ST system is costly due to the necessity
+of re-training on a combination of new and previous datasets. We propose to
+expand new language pairs by merging the model trained on new language pairs
+and the existing model, using task arithmetic. We find that the direct
+application of task arithmetic for ST causes the merged model to fail to
+follow instructions, thus generating translations in incorrect languages. To
+eliminate language confusion, we propose an augmented task arithmetic method
+that merges an additional language control model. It is trained to generate
+the correct target language token following the instructions. Our experiments
+demonstrate that our proposed language control model can achieve language
+expansion by eliminating language confusion. In our MuST-C and CoVoST-2
+experiments, it shows up to 4.66 and 4.92 BLEU score improvements,
+respectively. In addition, we demonstrate that our task arithmetic framework
+can expand to a language pair where neither paired ST training data nor a
+pre-trained ST model is available. We first synthesize the ST system from
+machine translation (MT) systems via task analogy, then merge the synthesized
+ST system into the existing ST model.
+
+&#13;
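+
+ The merging step can be sketched as plain parameter arithmetic over state
+dicts (the coefficients and names here are illustrative assumptions; the
+paper's augmented method additionally trains the language control model to
+emit the correct target-language token):
+
+def merge_st_models(base, new_pair, lang_control, lam=1.0, mu=1.0):
+    # Task vectors are parameter deltas relative to the shared base model.
+    return {k: base[k]
+               + lam * (new_pair[k] - base[k])       # new language pair
+               + mu * (lang_control[k] - base[k])    # language control
+            for k in base}
+
+# Toy two-parameter "models"; real models would be PyTorch state_dicts.
+base = {"w": 1.0, "b": 0.0}
+new_pair = {"w": 1.5, "b": 0.2}
+lang_ctl = {"w": 1.1, "b": -0.1}
+print(merge_st_models(base, new_pair, lang_ctl))
+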
+
+
+
+
+ + ☆ Bio-Inspired Mamba: Temporal Locality and Bioplausible Learning in + Selective State Space Models + + +
+ This paper introduces Bio-Inspired Mamba (BIM), a novel online learning +framework for selective state space models that integrates biological learning +principles with the Mamba architecture. BIM combines Real-Time Recurrent +Learning (RTRL) with Spike-Timing-Dependent Plasticity (STDP)-like local +learning rules, addressing the challenges of temporal locality and biological +plausibility in training spiking neural networks. Our approach leverages the +inherent connection between backpropagation through time and STDP, offering a +computationally efficient alternative that maintains the ability to capture +long-range dependencies. We evaluate BIM on language modeling, speech +recognition, and biomedical signal analysis tasks, demonstrating competitive +performance against traditional methods while adhering to biological learning +principles. Results show improved energy efficiency and potential for +neuromorphic hardware implementation. BIM not only advances the field of +biologically plausible machine learning but also provides insights into the +mechanisms of temporal information processing in biological neural networks. + +
+
+ comment: 17 pages, 1 figure, 2 tables +
+
+
+
+
+ + ☆ Norm of Mean Contextualized Embeddings Determines their Variance + + +
+ Contextualized embeddings vary by context, even for the same token, and form +a distribution in the embedding space. To analyze this distribution, we focus +on the norm of the mean embedding and the variance of the embeddings. In this +study, we first demonstrate that these values follow the well-known formula for +variance in statistics and provide an efficient sequential computation method. +Then, by observing embeddings from intermediate layers of several Transformer +models, we found a strong trade-off relationship between the norm and the +variance: as the mean embedding becomes closer to the origin, the variance +increases. This trade-off is likely influenced by the layer normalization +mechanism used in Transformer models. Furthermore, when the sets of token +embeddings are treated as clusters, we show that the variance of the entire +embedding set can theoretically be decomposed into the within-cluster variance +and the between-cluster variance. We found experimentally that as the layers of +Transformer models deepen, the embeddings move farther from the origin, the +between-cluster variance relatively decreases, and the within-cluster variance +relatively increases. These results are consistent with existing studies on the +anisotropy of the embedding spaces across layers. + +
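+
+ The "well-known formula" in question is the identity
+Var(x) = E[||x||^2] - ||E[x]||^2, which a few lines of NumPy verify on toy
+embeddings (illustrative data, not the paper's experiments):
+
+import numpy as np
+
+emb = np.random.randn(1000, 8) + 2.0  # one token's embeddings across contexts
+
+mean_norm_sq = np.sum(np.mean(emb, axis=0) ** 2)   # ||E[x]||^2
+second_moment = np.mean(np.sum(emb ** 2, axis=1))  # E[||x||^2]
+variance = np.mean(np.sum((emb - emb.mean(axis=0)) ** 2, axis=1))
+
+print(np.isclose(variance, second_moment - mean_norm_sq))  # True
+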
+
+
+
+
+ + ☆ WER We Stand: Benchmarking Urdu ASR Models + + +
+ This paper presents a comprehensive evaluation of Urdu Automatic Speech +Recognition (ASR) models. We analyze the performance of three ASR model +families: Whisper, MMS, and Seamless-M4T using Word Error Rate (WER), along +with a detailed examination of the most frequent wrong words and error types +including insertions, deletions, and substitutions. Our analysis is conducted +using two types of datasets, read speech and conversational speech. Notably, we +present the first conversational speech dataset designed for benchmarking Urdu +ASR models. We find that seamless-large outperforms other ASR models on the +read speech dataset, while whisper-large performs best on the conversational +speech dataset. Furthermore, this evaluation highlights the complexities of +assessing ASR models for low-resource languages like Urdu using quantitative +metrics alone and emphasizes the need for a robust Urdu text normalization +system. Our findings contribute valuable insights for developing robust ASR +systems for low-resource languages like Urdu. + +
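+
+ For reference, WER is the word-level Levenshtein distance (substitutions +
+insertions + deletions) normalized by reference length, as in this
+self-contained sketch:
+
+def wer(ref, hyp):
+    r, h = ref.split(), hyp.split()
+    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
+    for i in range(len(r) + 1):
+        d[i][0] = i
+    for j in range(len(h) + 1):
+        d[0][j] = j
+    for i in range(1, len(r) + 1):
+        for j in range(1, len(h) + 1):
+            sub = d[i - 1][j - 1] + (r[i - 1] != h[j - 1])
+            d[i][j] = min(sub, d[i - 1][j] + 1, d[i][j - 1] + 1)
+    return d[len(r)][len(h)] / len(r)
+
+print(wer("the cat sat", "the cat sat down"))  # one insertion -> 0.33
+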
+
+
+
+
+ + ☆ Linear Recency Bias During Training Improves Transformers' Fit to + Reading Times + + +
+ Recent psycholinguistic research has compared human reading times to +surprisal estimates from language models to study the factors shaping human +sentence processing difficulty. Previous studies have shown a strong fit +between surprisal values from Transformers and reading times. However, standard +Transformers work with a lossless representation of the entire previous +linguistic context, unlike models of human language processing that include +memory decay. To bridge this gap, this paper evaluates a modification of the +Transformer model that uses ALiBi (Press et al., 2022), a recency bias added to +attention scores. Surprisal estimates with ALiBi show an improved fit to human +reading times compared to a standard Transformer baseline. A subsequent +analysis of attention heads suggests that ALiBi's mixture of slopes -- which +determine the rate of memory decay in each attention head -- may play a role in +the improvement by helping models with ALiBi to track different kinds of +linguistic dependencies. + +
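+
+ A small NumPy sketch of the ALiBi bias being evaluated: a linear,
+head-specific recency penalty added to attention scores, with slopes forming
+a geometric sequence across heads (Press et al., 2022); the toy sizes are
+illustrative:
+
+import numpy as np
+
+def alibi_bias(seq_len, num_heads):
+    # Slopes 2^(-8h/H) for heads h = 1..H, per the ALiBi paper.
+    slopes = 2.0 ** (-8.0 * np.arange(1, num_heads + 1) / num_heads)
+    q = np.arange(seq_len)[:, None]
+    k = np.arange(seq_len)[None, :]
+    dist = np.where(k <= q, q - k, 0)  # distance to each earlier token
+    # Linear recency penalty, added to scores before the softmax.
+    return -slopes[:, None, None] * dist  # shape: (heads, query, key)
+
+print(alibi_bias(4, 2)[0])  # bias matrix for the steepest head
+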
+
+
+
+
+ + ☆ Measuring and Enhancing Trustworthiness of LLMs in RAG through Grounded + Attributions and Learning to Refuse + + +
+ LLMs are an integral part of retrieval-augmented generation (RAG) systems. +While many studies focus on evaluating the quality of end-to-end RAG systems, +there is a lack of research on understanding the appropriateness of an LLM for +the RAG task. Thus, we introduce a new metric, Trust-Score, that provides a +holistic evaluation of the trustworthiness of LLMs in an RAG framework. We show +that various prompting methods, such as in-context learning, fail to adapt LLMs +effectively to the RAG task. Thus, we propose Trust-Align, a framework to align +LLMs for higher Trust-Score. LLaMA-3-8b, aligned with our method, significantly +outperforms open-source LLMs of comparable sizes on ASQA (up 10.7), QAMPARI (up +29.2) and ELI5 (up 14.9). We release our code at: +https://github.com/declare-lab/trust-align. + +
+
+
+
+
+ + ☆ Spontaneous Informal Speech Dataset for Punctuation Restoration + + +
+ Presently, punctuation restoration models are evaluated almost solely on
+well-structured, scripted corpora. On the other hand, real-world ASR systems
+and post-processing pipelines are typically applied to spontaneous speech
+with significant irregularities, stutters, and deviations from perfect
+grammar. To address this discrepancy, we introduce SponSpeech, a punctuation
+restoration dataset derived from informal speech sources, which includes
+punctuation and casing information. In addition to publicly releasing the
+dataset, we contribute a filtering pipeline that can be used to generate more
+data. Our filtering pipeline examines the quality of both speech audio and
+transcription text. We also carefully construct a ``challenging" test set,
+aimed at evaluating models' ability to leverage audio information to predict
+otherwise grammatically ambiguous punctuation. SponSpeech is available at
+https://github.com/GitHubAccountAnonymous/PR, along with all code for dataset
+building and model runs.
+
+&#13;
+
+ comment: 8 pages, 7 tables, 1 figure, Recognition Technologies, Inc. Technical + Report +
+
+
+
+
+ + ☆ LLM-as-a-Judge & Reward Model: What They Can and Cannot Do + + +
+ LLM-as-a-Judge and reward models are widely used alternatives to
+multiple-choice questions or human annotators for large language model (LLM)
+evaluation. Their efficacy shines in evaluating long-form responses, serving
+a critical role as evaluators of leaderboards and as proxies to align LLMs
+via reinforcement learning. However, despite their popularity, their
+effectiveness outside of English remains largely unexplored. In this paper,
+we conduct a comprehensive analysis on automated evaluators, reporting key
+findings on their behavior in a non-English environment. First, we discover
+that English evaluation capabilities significantly influence
+language-specific capabilities, often more than the language proficiency
+itself, enabling evaluators trained in English to easily transfer their
+skills to other languages. Second, we identify critical shortcomings, where
+LLMs fail to detect and penalize errors, such as factual inaccuracies,
+cultural misrepresentations, and the presence of unwanted language. Finally,
+we release Kudge, the first non-English meta-evaluation dataset containing
+5,012 human annotations in Korean.
+
+&#13;
+
+ comment: preprint +
+
+
+
+
+ + ☆ Evaluating the Impact of Compression Techniques on Task-Specific + Performance of Large Language Models + + +
+ Large language models (LLMs) offer powerful capabilities but incur +substantial computational costs, driving the need for efficient compression +techniques. This study evaluates the impact of popular compression methods - +Magnitude Pruning, SparseGPT, and Wanda - on the LLaMA-2-7B model, focusing on +the trade-offs between model size reduction, downstream task performance, and +the role of calibration data. Our findings reveal that while SparseGPT and +Wanda preserve perplexity even at 50% sparsity, they suffer significant +degradation on downstream tasks, highlighting the inadequacy of perplexity as +the sole evaluation metric. To address this, we introduce Jensen-Shannon (JS) +Divergence as a more comprehensive metric that captures nuanced changes in +model behavior post-compression. We further demonstrate that task-specific +calibration data significantly enhances the downstream performance of +compressed models compared to general calibration data. This research +underscores the necessity for diverse evaluation metrics and careful +calibration data selection to fully understand the complexities of LLM +compression and its implications for practical applications. + +
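+
+ The proposed metric can be computed with SciPy (toy next-token
+distributions for illustration; note that `jensenshannon` returns the JS
+distance, the square root of the divergence):
+
+import numpy as np
+from scipy.spatial.distance import jensenshannon
+
+p = np.array([0.70, 0.20, 0.10])  # original model's next-token distribution
+q = np.array([0.55, 0.30, 0.15])  # compressed model's distribution
+
+js_divergence = jensenshannon(p, q, base=2) ** 2
+print(js_divergence)  # 0 = identical behavior; larger = more drift
+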
+
+
+
+
+ + ☆ Exploring ChatGPT-based Augmentation Strategies for Contrastive + Aspect-based Sentiment Analysis + + +
+ Aspect-based sentiment analysis (ABSA) involves identifying sentiment towards +specific aspect terms in a sentence and allows us to uncover nuanced +perspectives and attitudes on particular aspects of a product, service, or +topic. However, the scarcity of labeled data poses a significant challenge to +training high-quality models. To address this issue, we explore the potential +of data augmentation using ChatGPT, a well-performing large language model +(LLM), to enhance the sentiment classification performance towards aspect +terms. Specifically, we explore three data augmentation strategies based on +ChatGPT: context-focused, aspect-focused, and context-aspect data augmentation +techniques. Context-focused data augmentation focuses on changing the word +expression of context words in the sentence while keeping aspect terms +unchanged. In contrast, aspect-focused data augmentation aims to change aspect +terms but keep context words unchanged. Context-Aspect data augmentation +integrates the above two data augmentations to generate augmented samples. +Furthermore, we incorporate contrastive learning into the ABSA tasks to improve +performance. Extensive experiments show that all three data augmentation +techniques lead to performance improvements, with the context-aspect data +augmentation strategy performing best and surpassing the performance of the +baseline models. + +
+
+ comment: 8 pages, 3 figures +
+
+
+
+
+ + ☆ Self-Evolutionary Large Language Models through Uncertainty-Enhanced + Preference Optimization + + +
+ Iterative preference optimization has recently become one of the de-facto
+training paradigms for large language models (LLMs), but performance is still
+underwhelming due to the large amount of noisy preference data yielded in the
+loop. To combat this issue, we present an \textbf{U}ncertainty-enhanced
+\textbf{P}reference \textbf{O}ptimization (UPO) framework to make the LLM
+self-evolve with reliable feedback. The key idea is to mitigate the noisy
+preference data derived from the current policy and reward models by
+performing pair-wise uncertainty estimation and judicious, reliable feedback
+sampling. To reach this goal, we introduce an estimator model, which
+incorporates Monte Carlo (MC) dropout in a Bayesian neural network (BNN) to
+perform uncertainty estimation for the preference data derived from the LLM
+policy. Compared to the existing methods that directly filter generated
+responses based on the reward score, the estimator focuses on the model
+uncertainty in a pair-wise manner and effectively bypasses the confirmation
+bias problem of the reward model. Additionally, we also propose an
+uncertainty-enhanced self-evolution algorithm to improve the robustness of
+preference optimization and encourage the LLM to generate responses with both
+high reward and certainty. Extensive experiments over multiple benchmarks
+demonstrate that our framework substantially alleviates the noise problem and
+improves the performance of iterative preference optimization.
+
+&#13;
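+
+ The MC-dropout estimation step can be sketched as follows (the estimator
+architecture and the use of the predictive standard deviation as the
+uncertainty score are our assumptions):
+
+import torch
+import torch.nn as nn
+
+def mc_dropout_uncertainty(model, x, n_samples=20):
+    model.train()  # keep nn.Dropout stochastic at inference time
+    with torch.no_grad():
+        preds = torch.stack([torch.sigmoid(model(x))
+                             for _ in range(n_samples)])
+    return preds.mean(0), preds.std(0)  # score and its uncertainty
+
+estimator = nn.Sequential(nn.Linear(16, 64), nn.ReLU(),
+                          nn.Dropout(0.2), nn.Linear(64, 1))
+mean, std = mc_dropout_uncertainty(estimator, torch.randn(8, 16))
+print(mean.squeeze(), std.squeeze())
+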
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Capturing Differences in Character Representations Between Communities: + An Initial Study with Fandom + + +
+ Sociolinguistic theories have highlighted how narratives are often retold, +co-constructed and reconceptualized in collaborative settings. This working +paper focuses on the re-interpretation of characters, an integral part of the +narrative story-world, and attempts to study how this may be computationally +compared between online communities. Using online fandom - a highly communal +phenomenon that has been largely studied qualitatively - as data, computational +methods were applied to explore shifts in character representations between two +communities and the original text. Specifically, text from the Harry Potter +novels, r/HarryPotter subreddit, and fanfiction on Archive of Our Own were +analyzed for changes in character mentions, centrality measures from +co-occurrence networks, and semantic associations. While fandom elevates +secondary characters as found in past work, the two fan communities prioritize +different subsets of characters. Word embedding tests reveal starkly different +associations of the same characters between communities on the gendered +concepts of femininity/masculinity, cruelty, and beauty. Furthermore, +fanfiction descriptions of a male character analyzed between romance pairings +scored higher for feminine-coded characteristics in male-male romance, matching +past qualitative theorizing. The results highlight the potential for +computational methods to assist in capturing the reconceptualization of +narrative elements across communities and in supporting qualitative research on +fandom. + +
+
+ comment: Accepted and presented as a working paper in SBP-BRiMS 2024 +
+
+
+
+
+ + ☆ SAGED: A Holistic Bias-Benchmarking Pipeline for Language Models with + Customisable Fairness Calibration COLING 2025 + + +
+ The development of unbiased large language models is widely recognized as +crucial, yet existing benchmarks fall short in detecting biases due to limited +scope, contamination, and lack of a fairness baseline. SAGED(-Bias) is the +first holistic benchmarking pipeline to address these problems. The pipeline +encompasses five core stages: scraping materials, assembling benchmarks, +generating responses, extracting numeric features, and diagnosing with +disparity metrics. SAGED includes metrics for max disparity, such as impact +ratio, and bias concentration, such as Max Z-scores. Noticing that assessment +tool bias and contextual bias in prompts can distort evaluation, SAGED +implements counterfactual branching and baseline calibration for mitigation. +For demonstration, we use SAGED on G20 Countries with popular 8b-level models +including Gemma2, Llama3.1, Mistral, and Qwen2. With sentiment analysis, we +find that while Mistral and Qwen2 show lower max disparity and higher bias +concentration than Gemma2 and Llama3.1, all models are notably biased against +countries like Russia and (except for Qwen2) China. In further experiments that +have models role-play U.S. (vice-/former-) presidents, we see bias amplify and +shift in heterogeneous directions. Moreover, we find that Qwen2 and Mistral do +not engage in role-playing, while Llama3.1 and Gemma2 role-play Trump notably +more intensively than Biden and Harris, indicating a role-playing performance +bias in these models. + +
+
+ comment: Submitted to COLING 2025 Main Conference +
+
+
+
+
+ + ☆ Improving the Efficiency of Visually Augmented Language Models + + +
+ Despite the impressive performance of autoregressive Language Models (LMs), +it has been shown that due to reporting bias, LMs lack visual knowledge, i.e., +they do not know much about the visual world and its properties. To augment LMs +with visual knowledge, existing solutions often rely on explicit images, +requiring time-consuming retrieval or image generation systems. This paper +shows that explicit images are not necessary to visually augment an LM. +Instead, we use visually-grounded text representations obtained from the +well-known CLIP multimodal system. For a fair comparison, we modify VALM, a +visually-augmented LM which uses image retrieval and representation, to work +directly with visually-grounded text representations. We name this new model +BLIND-VALM. We show that BLIND-VALM performs on par with VALM for Visual +Language Understanding (VLU), Natural Language Understanding (NLU) and Language +Modeling tasks, despite being significantly more efficient and simpler. We also +show that when scaling up our model within the compute budget of VALM, by +increasing either the model size or the pre-training corpus size, we outperform +VALM on all the evaluation tasks. + +
+
+
+
+
+ + ☆ Reasoning Graph Enhanced Exemplars Retrieval for In-Context Learning + + +
+ Large language models (LLMs) have exhibited remarkable few-shot learning +capabilities and unified the paradigm of NLP tasks through the in-context +learning (ICL) technique. Despite the success of ICL, the quality of the +exemplar demonstrations can significantly influence the LLM's performance. +Existing exemplar selection methods mainly focus on the semantic similarity +between queries and candidate exemplars. On the other hand, the logical +connections between reasoning steps can be beneficial for depicting the +problem-solving process as well. In this paper, we propose a novel method named +Reasoning Graph-enhanced Exemplar Retrieval (RGER). RGER first queries the LLM +to generate an initial response, then expresses the intermediate +problem-solving steps as a graph structure. After that, it employs a graph +kernel to select exemplars with semantic and structural similarity. Extensive +experiments demonstrate that the structural relationship is helpful for the +alignment of queries and candidate exemplars. The efficacy of RGER on math and +logic reasoning tasks showcases its superiority over state-of-the-art +retrieval-based approaches. Our code is released at +https://github.com/Yukang-Lin/RGER. + +
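+
+ The abstract does not name the specific graph kernel; a Weisfeiler-Lehman
+subtree kernel is one standard choice for comparing labeled reasoning graphs,
+sketched below as an assumption rather than the paper's method.
+ <pre><code>
+import networkx as nx
+from collections import Counter
+
+def wl_features(g, iterations=2):
+    """Weisfeiler-Lehman subtree features of a labeled reasoning graph."""
+    labels = {n: str(g.nodes[n].get("label", "")) for n in g}
+    feats = Counter(labels.values())
+    for _ in range(iterations):
+        labels = {n: labels[n] + "|" + ".".join(sorted(labels[m] for m in g[n]))
+                  for n in g}
+        feats.update(labels.values())
+    return feats
+
+def wl_kernel(g1, g2):
+    """Unnormalized WL kernel: dot product of subtree-pattern counts."""
+    f1, f2 = wl_features(g1), wl_features(g2)
+    return sum(c * f2[k] for k, c in f1.items() if k in f2)
+ </code></pre>
+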
+
+
+
+
+ + ☆ Semformer: Transformer Language Models with Semantic Planning + + +
+ Next-token prediction serves as the dominant component in current neural +language models. During the training phase, the model employs teacher forcing, +which predicts tokens based on all preceding ground truth tokens. However, this +approach has been found to create shortcuts, utilizing the revealed prefix to +spuriously fit future tokens, potentially compromising the accuracy of the +next-token predictor. In this paper, we introduce Semformer, a novel method of +training a Transformer language model that explicitly models the semantic +planning of the response. Specifically, we incorporate a sequence of planning +tokens into the prefix, guiding the planning token representations to predict +the latent semantic representations of the response, which are induced by an +autoencoder. In a minimal planning task (i.e., graph path-finding), our model +exhibits near-perfect performance and effectively mitigates shortcut learning, +a feat that standard training methods and baseline models have been unable to +accomplish. Furthermore, we pretrain Semformer from scratch with 125M +parameters, demonstrating its efficacy through measures of perplexity, +in-context learning, and fine-tuning on summarization tasks. + +
+
+
+
+
+ + ☆ Promptriever: Instruction-Trained Retrievers Can Be Prompted Like + Language Models + + +
+ Instruction-tuned language models (LMs) are able to respond to imperative +commands, providing a more natural user interface compared to their base +counterparts. In this work, we present Promptriever, the first retrieval model +able to be prompted like an LM. To train Promptriever, we curate and release a +new instance-level instruction training set from MS MARCO, spanning nearly 500k +instances. Promptriever not only achieves strong performance on standard +retrieval tasks, but also follows instructions. We observe: (1) large gains +(reaching SoTA) on following detailed relevance instructions (+14.3 p-MRR / ++3.1 nDCG on FollowIR), (2) significantly increased robustness to lexical +choices/phrasing in the query+instruction (+12.9 Robustness@10 on InstructIR), +and (3) the ability to perform hyperparameter search via prompting to reliably +improve retrieval performance (+1.4 average increase on BEIR). Promptriever +demonstrates that retrieval models can be controlled with prompts on a +per-query basis, setting the stage for future work aligning LM prompting +techniques with information retrieval. + +
+
+
+
+
+ + ☆ Strategic Insights in Human and Large Language Model Tactics at Word + Guessing Games ACL 2024 + + +
+ At the beginning of 2022, a simplistic word-guessing game took the world by +storm and was further adapted to many languages beyond the original English +version. In this paper, we examine the strategies of daily word-guessing game +players that have evolved during a period of over two years. A survey gathered +from 25% of frequent players reveals their strategies and motivations for +continuing the daily journey. We also explore the capability of several popular +open-access large language model systems and open-source models at +comprehending and playing the game in two different languages. Results +highlight the struggles of certain models to maintain the correct guess length, +their tendency to generate repetitions, and their hallucinations of +non-existent words and inflections. + +
+
+ comment: Published in the 4th Wordplay: When Language Meets Games Workshop @ + ACL 2024 +
+
+
+
+
+ + ☆ KVPruner: Structural Pruning for Faster and Memory-Efficient Large + Language Models + + +
+ The bottleneck associated with the key-value (KV) cache presents a +significant challenge during the inference processes of large language models. +While depth pruning accelerates inference, it requires extensive recovery +training, which can take up to two weeks. On the other hand, width pruning +retains much of the performance but offers only slight speed gains. To tackle +these challenges, we propose KVPruner to improve model efficiency while +maintaining performance. Our method uses global perplexity-based analysis to +determine the importance ratio for each block and provides multiple strategies +to prune non-essential KV channels within blocks. Compared to the original +model, KVPruner reduces runtime memory usage by 50% and boosts throughput by +over 35%. Additionally, our method requires only two hours of LoRA fine-tuning +on small datasets to recover most of the performance. + +
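+
+ KVPruner's exact importance ratio is not given in the abstract; the sketch
+below only illustrates the general idea of perplexity-based block importance,
+with nll() as a hypothetical helper returning mean negative log-likelihood.
+ <pre><code>
+import math
+import torch
+
+@torch.no_grad()
+def block_importance(model, blocks, batch, nll):
+    """Bypass one residual block at a time and record the perplexity
+    increase; larger increases mark blocks that are more important to keep."""
+    base = math.exp(nll(model, batch).item())
+    scores = {}
+    for i, blk in enumerate(blocks):
+        h = blk.register_forward_hook(lambda m, inp, out: inp[0])  # skip block
+        scores[i] = math.exp(nll(model, batch).item()) - base
+        h.remove()
+    return scores
+ </code></pre>
+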
+
+
+
+
+ + ☆ Large Language Models are Good Multi-lingual Learners : When LLMs Meet + Cross-lingual Prompts + + +
+ With the advent of Large Language Models (LLMs), generating rule-based data +for real-world applications has become more accessible. Due to the inherent +ambiguity of natural language and the complexity of rule sets, especially in +long contexts, LLMs often struggle to follow all specified rules, frequently +omitting at least one. To enhance the reasoning and understanding of LLMs on +long and complex contexts, we propose a novel prompting strategy, Multi-Lingual +Prompt (MLPrompt), which automatically translates the error-prone rule that an +LLM struggles to follow into another language, thus drawing greater attention +to it. Experimental results on public datasets across various tasks have shown +that MLPrompt can outperform state-of-the-art prompting methods such as Chain +of Thought, Tree of Thought, and Self-Consistency. Additionally, we introduce a +framework integrating MLPrompt with an auto-checking mechanism for structured +data generation, with a specific case study in text-to-MIP instances. Further, +we extend the proposed framework to text-to-SQL to demonstrate its ability to +synthesize structured data. + +
+
+
+
+
+ + ☆ A Comprehensive Evaluation of Quantized Instruction-Tuned Large Language + Models: An Experimental Analysis up to 405B + + +
+ Prior research works have evaluated quantized LLMs using limited metrics such +as perplexity or a few basic knowledge tasks and old datasets. Additionally, +recent large-scale models such as Llama 3.1, with up to 405B parameters, have +not been thoroughly examined. This paper evaluates the performance of +instruction-tuned LLMs across various quantization methods (GPTQ, AWQ, +SmoothQuant, and FP8) on models ranging from 7B to 405B. Using 13 benchmarks, +we assess performance across six task types: commonsense Q&A, knowledge and +language understanding, instruction following, hallucination detection, +mathematics, and dialogue. Our key findings reveal that (1) quantizing a larger +LLM to a similar size as a smaller FP16 LLM generally performs better across +most benchmarks, except for hallucination detection and instruction following; +(2) performance varies significantly with different quantization methods, model +size, and bit-width, with weight-only methods often yielding better results in +larger models; (3) task difficulty does not significantly impact accuracy +degradation due to quantization; and (4) the MT-Bench evaluation method has +limited discriminatory power among recent high-performing LLMs. + +
+
+ comment: 11 pages, 1 figure +
+
+
+
+
+ + ♻ ☆ MMLU-Pro+: Evaluating Higher-Order Reasoning and Shortcut Learning in + LLMs + + +
+ Existing benchmarks for large language models (LLMs) increasingly struggle to +differentiate between top-performing models, underscoring the need for more +challenging evaluation frameworks. We introduce MMLU-Pro+, an enhanced +benchmark building upon MMLU-Pro to assess shortcut learning and higher-order +reasoning in LLMs. By incorporating questions with multiple correct answers +across diverse domains, MMLU-Pro+ tests LLMs' ability to engage in complex +reasoning and resist simplistic problem-solving strategies. Our results show +that MMLU-Pro+ maintains MMLU-Pro's difficulty while providing a more rigorous +test of model discrimination, particularly in multi-correct answer scenarios. +We introduce novel metrics like shortcut selection ratio and correct pair +identification ratio, offering deeper insights into model behavior and +anchoring bias. Evaluations of six state-of-the-art LLMs reveal significant +performance gaps, highlighting variations in reasoning abilities and bias +susceptibility. We release the dataset and evaluation codes at +https://github.com/asgsaeid/mmlu-pro-plus. + +
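+
+ The paper defines its metrics precisely; the sketch below shows only one
+plausible reading of a shortcut selection ratio over questions with two correct
+options, as an illustration rather than the paper's definition.
+ <pre><code>
+def shortcut_selection_ratio(records):
+    """Among questions with two correct options, the fraction where the
+    model selects exactly one of them, suggesting it stopped at the first
+    plausible answer. Each record: {"predicted": set, "correct": set}."""
+    pairs = [r for r in records if len(r["correct"]) == 2]
+    partial = sum(1 for r in pairs
+                  if len(r["predicted"].intersection(r["correct"])) == 1)
+    return partial / max(len(pairs), 1)
+ </code></pre>
+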
+
+
+
+
+ + ♻ ☆ CoT Rerailer: Enhancing the Reliability of Large Language Models in + Complex Reasoning Tasks through Error Detection and Correction + + +
+ Chain-of-Thought (CoT) prompting enhances the complex reasoning abilities of +Large Language Models (LLMs) by generating intermediate steps. However, these +steps can introduce hallucinations and accumulate errors. We propose the CoT +Rerailer to address these challenges, employing self-consistency and +multi-agent debate systems to identify and rectify errors in the reasoning +process. The CoT Rerailer first selects the most logically correct Reasoning +Path (RP) using consistency checks and critical evaluation by automated agents. +It then engages a multi-agent debate system to propose and validate corrections +to ensure the generation of an error-free intermediate logical path. The +corrected steps are then used to generate a revised reasoning chain to further +reduce hallucinations and enhance answer quality. We demonstrate the +effectiveness of our approach across diverse question-answering datasets in +various knowledge domains. The CoT Rerailer enhances the reliability of +LLM-generated reasoning, contributing to more trustworthy AI-driven +decision-making processes. + +
+
+
+
+
+ + ♻ ☆ Fine-grained LLM Agent: Pinpointing and Refining Large Language Models + via Fine-Grained Actionable Feedback NAACL 2024 + + +
+ Recent large language models (LLMs) are leveraging human feedback to improve +their generation quality. However, human feedback is costly to obtain, +especially during inference. In this work, we propose Fine-grained LLM agent, +an inference-time optimization method to refine an LLM's output. The core idea +is to use a learned fine-grained feedback model to pinpoint defects and guide +the LLM to refine them iteratively. Using the original LLM to propose edits, +Fine-grained LLM agent searches for defect-less text via simulated annealing, +trading off exploration and exploitation. We conduct experiments on three text +generation tasks, including machine translation, long-form question answering +(QA), and topical summarization. Fine-grained LLM agent consistently +outperforms all baseline approaches, achieving improvements of up to 1.7 +MetricX points on translation tasks, 8.1 ROUGE-L on ASQA, and 2.2 ROUGE-L on +topical summarization. + +
+
+ comment: Accepted to NAACL 2024 +
+
+
+
+
+ + ♻ ☆ Unmasking the Imposters: How Censorship and Domain Adaptation Affect the + Detection of Machine-Generated Tweets + + +
+ The rapid development of large language models (LLMs) has significantly +improved the generation of fluent and convincing text, raising concerns about +their potential misuse on social media platforms. We present a comprehensive +methodology for creating nine Twitter datasets to examine the generative +capabilities of four prominent LLMs: Llama 3, Mistral, Qwen2, and GPT4o. These +datasets encompass four censored and five uncensored model configurations, +including 7B and 8B parameter base-instruction models of the three open-source +LLMs. Additionally, we perform a data quality analysis to assess the +characteristics of textual outputs from human, "censored," and "uncensored" +models, employing semantic meaning, lexical richness, structural patterns, +content characteristics, and detector performance metrics to identify +differences and similarities. Our evaluation demonstrates that "uncensored" +models significantly undermine the effectiveness of automated detection +methods. This study addresses a critical gap by exploring smaller open-source +models and the ramifications of "uncensoring," providing valuable insights into +how domain adaptation and content moderation strategies influence both the +detectability and structural characteristics of machine-generated text. + +
+
+
+
+
+ + ♻ ☆ Revisiting Cosine Similarity via Normalized ICA-transformed Embeddings + + +
+ Cosine similarity is widely used to measure the similarity between two +embeddings, while interpretations based on angle and correlation coefficient +are common. In this study, we focus on the interpretable axes of embeddings +transformed by Independent Component Analysis (ICA), and propose a novel +interpretation of cosine similarity as the sum of semantic similarities over +axes. The normalized ICA-transformed embeddings exhibit sparsity, enhancing the +interpretability of each axis, and the semantic similarity defined by the +product of the components represents the shared meaning between the two +embeddings along each axis. The effectiveness of this approach is demonstrated +through intuitive numerical examples and thorough numerical experiments. By +deriving the probability distributions that govern each component and the +product of components, we propose a method for selecting statistically +significant axes. + +
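+
+ A small numpy/scikit-learn sketch of the decomposition: for row-normalized
+ICA-transformed embeddings, the per-axis products sum exactly to the cosine
+similarity. The random matrix stands in for a real embedding table.
+ <pre><code>
+import numpy as np
+from sklearn.decomposition import FastICA
+
+rng = np.random.default_rng(0)
+X = rng.standard_normal((1000, 50))               # toy embedding matrix
+S = FastICA(n_components=50, random_state=0).fit_transform(X)
+S = S / np.linalg.norm(S, axis=1, keepdims=True)  # normalize rows
+
+a, b = S[0], S[1]
+axis_contrib = a * b              # semantic similarity along each ICA axis
+print(axis_contrib.sum(), a @ b)  # both equal the cosine similarity
+ </code></pre>
+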
+
+
+
+
+ + ♻ ☆ Image Hijacks: Adversarial Images can Control Generative Models at + Runtime + + +
+ Are foundation models secure against malicious actors? In this work, we focus +on the image input to a vision-language model (VLM). We discover image hijacks, +adversarial images that control the behaviour of VLMs at inference time, and +introduce the general Behaviour Matching algorithm for training image hijacks. +From this, we derive the Prompt Matching method, allowing us to train hijacks +matching the behaviour of an arbitrary user-defined text prompt (e.g. 'the +Eiffel Tower is now located in Rome') using a generic, off-the-shelf dataset +unrelated to our choice of prompt. We use Behaviour Matching to craft hijacks +for four types of attack, forcing VLMs to generate outputs of the adversary's +choice, leak information from their context window, override their safety +training, and believe false statements. We study these attacks against LLaVA, a +state-of-the-art VLM based on CLIP and LLaMA-2, and find that all attack types +achieve a success rate of over 80%. Moreover, our attacks are automated and +require only small image perturbations. + +
+
+ comment: Project page at https://image-hijacks.github.io +
+
+
+
+
+ + ♻ ☆ Integrating Knowledge Retrieval and Large Language Models for Clinical + Report Correction + + +
+ This study proposes an approach for error correction in radiology reports, +leveraging large language models (LLMs) and retrieval-augmented generation +(RAG) techniques. The proposed framework employs a novel internal+external +retrieval mechanism to extract relevant medical entities and relations from the +report of interest and an external knowledge source. A three-stage inference +process is introduced, decomposing the task into error detection, localization, +and correction subtasks, which enhances the explainability and performance of +the system. The effectiveness of the approach is evaluated using a benchmark +dataset created by corrupting real-world radiology reports with realistic +errors, guided by domain experts. Experimental results demonstrate the benefits +of the proposed methods, with the combination of internal and external +retrieval significantly improving the accuracy of error detection, +localization, and correction across various state-of-the-art LLMs. The findings +contribute to the development of more robust and reliable error correction +systems for clinical documentation. + +
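+
+ A schematic of the three-stage inference, with llm and retrieve as
+hypothetical callables standing in for the paper's LLM and internal+external
+retrieval components; prompts are illustrative only.
+ <pre><code>
+def correct_report(report, llm, retrieve):
+    """Three-stage sketch: detect, then localize, then correct."""
+    facts = retrieve(report)  # relevant entities/relations + external knowledge
+    verdict = llm(f"Facts: {facts}\nReport: {report}\nAny error? yes/no")
+    if verdict.strip().lower() != "yes":
+        return report
+    span = llm(f"Facts: {facts}\nReport: {report}\nQuote the erroneous span.")
+    return llm(f"Facts: {facts}\nReport: {report}\nRewrite it, fixing: {span}")
+ </code></pre>
+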
+
+ comment: v2 +
+
+
+
+
+ + ♻ ☆ Larger Language Models Don't Care How You Think: Why Chain-of-Thought + Prompting Fails in Subjective Tasks + + +
+ In-Context Learning (ICL) in Large Language Models (LLMs) has emerged as the +dominant technique for performing natural language tasks, as it does not +require updating the model parameters with gradient-based methods. ICL promises +to "adapt" the LLM to perform the present task at a competitive or +state-of-the-art level at a fraction of the computational cost. ICL can be +augmented by incorporating the reasoning process to arrive at the final label +explicitly in the prompt, a technique called Chain-of-Thought (CoT) prompting. +However, recent work has found that ICL relies mostly on the retrieval of task +priors and less so on "learning" to perform tasks, especially for complex +subjective domains like emotion and morality, where priors ossify posterior +predictions. In this work, we examine whether "enabling" reasoning also creates +the same behavior in LLMs, wherein the format of CoT retrieves reasoning priors +that remain relatively unchanged despite the evidence in the prompt. We find +that, surprisingly, CoT indeed suffers from the same posterior collapse as ICL +for larger language models. Code is available at +https://github.com/gchochla/cot-priors. + +
+
+ comment: 5 pages, 2 figures, 1 table. arXiv admin note: text overlap with + arXiv:2403.17125 +
+
+
+
+
+ + ♻ ☆ Dated Data: Tracing Knowledge Cutoffs in Large Language Models + + +
+ Released Large Language Models (LLMs) are often paired with a claimed +knowledge cutoff date, or the dates at which training data was gathered. Such +information is crucial for applications where the LLM must provide up to date +information. However, this statement only scratches the surface: do all +resources in the training data share the same knowledge cutoff date? Does the +model's demonstrated knowledge for these subsets closely align to their cutoff +dates? In this work, we define the notion of an effective cutoff. This is +distinct from the LLM designer reported cutoff and applies separately to +sub-resources and topics. We propose a simple approach to estimate effective +cutoffs on the resource-level temporal alignment of an LLM by probing across +versions of the data. Using this analysis, we find that effective cutoffs often +differ from reported cutoffs. To understand the root cause of this observation, +we conduct a direct large-scale analysis on open pre-training datasets. Our +analysis reveals two reasons for these inconsistencies: (1) temporal biases of +CommonCrawl data due to non-trivial amounts of old data in new dumps and (2) +complications in LLM deduplication schemes involving semantic duplicates and +lexical near-duplicates. Overall, our results show that knowledge cutoffs are +not as simple as they have seemed and that care must be taken both by LLM +dataset curators as well as practitioners who seek to use information from +these models. + +
+
+
+
+
+ + ♻ ☆ Bridging Social Media and Search Engines: Dredge Words and the Detection + of Unreliable Domains + + +
+ Proactive content moderation requires platforms to rapidly and continuously +evaluate the credibility of websites. Leveraging the direct and indirect paths +users follow to unreliable websites, we develop a website credibility +classification and discovery system that integrates both webgraph and +large-scale social media contexts. We additionally introduce the concept of +dredge words, terms or phrases for which unreliable domains rank highly on +search engines, and provide the first exploration of their usage on social +media. Our graph neural networks that combine webgraph and social media +contexts achieve state-of-the-art results in website credibility classification +and significantly improve the top-k identification of unreliable domains. +Additionally, we release a novel dataset of dredge words, highlighting their +strong connections to both social media and online commerce platforms. + +
+
+
+
+
+ + ♻ ☆ Do Language Models Exhibit Human-like Structural Priming Effects? ACL + + +
+ We explore which linguistic factors -- at the sentence and token level -- +play an important role in influencing language model predictions, and +investigate whether these are reflective of results found in humans and human +corpora (Gries and Kootstra, 2017). We make use of the structural priming +paradigm, where recent exposure to a structure facilitates processing of the +same structure. We investigate not only whether, but also where, priming +effects occur, and what factors predict them. We show that these effects can be +explained via the inverse frequency effect, known in human priming, where rarer +elements within a prime increase priming effects, as well as lexical dependence +between prime and target. Our results provide an important piece in the puzzle +of understanding how properties within their context affect structural +prediction in language models. + +
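+
+ Priming effects of this kind are typically quantified by comparing target
+log-probabilities under alternative primes; a generic sketch with GPT-2 follows
+(not the paper's exact protocol, and the example sentences are assumptions).
+ <pre><code>
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("gpt2")
+model = AutoModelForCausalLM.from_pretrained("gpt2").eval()
+
+@torch.no_grad()
+def logprob(context, target):
+    """Log-probability of target given context under the LM."""
+    ctx = tok(context, return_tensors="pt").input_ids
+    tgt = tok(target, return_tensors="pt").input_ids
+    ids = torch.cat([ctx, tgt], dim=1)
+    logp = model(ids).logits[:, :-1].log_softmax(-1)
+    rows = logp[0, ctx.shape[1] - 1 :]  # predictors of the target tokens
+    return rows.gather(1, tgt[0].unsqueeze(1)).sum().item()
+
+prime_do = "The boy gave the girl a book. "     # double-object prime
+prime_po = "The boy gave a book to the girl. "  # prepositional-object prime
+target = "The man sent the woman a letter."
+print(logprob(prime_do, target) - logprob(prime_po, target))
+ </code></pre>
+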
+
+ comment: ACL Findings 2024 +
+
+
+
+
+ + ♻ ☆ ExploreSelf: Fostering User-driven Exploration and Reflection on + Personal Challenges with Adaptive Guidance by Large Language Models + + +
+ Expressing stressful experiences in words is proven to improve mental and +physical health, but individuals often disengage with writing interventions as +they struggle to organize their thoughts and emotions. Reflective prompts have +been used to provide direction, and large language models (LLMs) have +demonstrated the potential to provide tailored guidance. Current systems often +limit users' flexibility to direct their reflections. We thus present +ExploreSelf, an LLM-driven application designed to empower users to control +their reflective journey. ExploreSelf allows users to receive adaptive support +through dynamically generated questions. Through an exploratory study with 19 +participants, we examine how participants explore and reflect on personal +challenges using ExploreSelf. Our findings demonstrate that participants valued +the balance between guided support and freedom to control their reflective +journey, leading to deeper engagement and insight. Building on our findings, we +discuss implications for designing LLM-driven tools that promote user +empowerment through effective reflective practices. + +
+
+ comment: 17 pages excluding reference and appendix +
+
+
+
+
+ + ♻ ☆ EmPO: Emotion Grounding for Empathetic Response Generation through + Preference Optimization EMNLP + + +
+ Empathetic response generation is a desirable aspect of conversational +agents, crucial for facilitating engaging and emotionally intelligent +multi-turn conversations between humans and machines. Leveraging large language +models for this task has shown promising results, yet challenges persist in +ensuring both the empathetic quality of the responses and retention of the +generalization performance of the models. We propose a novel approach where we +construct theory-driven preference datasets based on emotion grounding and use +them to align LLMs with preference optimization algorithms to address these +challenges. To evaluate empathetic response generation, we employ the +EmpatheticDialogues dataset, assessing empathy with the diff-Epitome and +BERTscore metrics and with multi-dimensional human evaluation. Additionally, we +measure diversity and emotional valence using feature-based methods. We also +evaluate the impact of training on the generalization performance using the +MMLU benchmark and tasks from the Open LLM Leaderboard. The results show that +LLMs can be aligned for empathetic response generation by preference +optimization while retaining their general performance and that emotion +grounding can guide preference dataset creation. We make all datasets, source +code, and models publicly available. https://github.com/justtherightsize/empo + +
+
+ comment: v02, 8 pages long paper, EMNLP ACL style +
+
+
+
+
+ + ♻ ☆ AI-Assisted Human Evaluation of Machine Translation + + +
+ Annually, research teams spend large amounts of money to evaluate the quality +of machine translation systems (WMT, inter alia). This is expensive because it +requires a lot of expert human labor. The recently adopted annotation protocol, +Error Span Annotation (ESA), has annotators marking erroneous parts of the +translation and then assigning a final score. A lot of the annotator time is +spent on scanning the translation for possible errors. In our work, we help the +annotators by pre-filling the error annotations with recall-oriented automatic +quality estimation. With this AI assistance, we obtain annotations at the same +quality level while cutting down the time per span annotation by half +(71s/error span → 31s/error span). The biggest advantage of the ESA^AI protocol +is an accurate priming of annotators (pre-filled error spans) before they +assign the final score. This also alleviates a potential automation bias, which +we confirm to be low. In addition, the annotation budget can be reduced by +almost 25% with filtering of examples that the AI deems to be very likely to be +correct. + +
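+
+ A sketch of the assistance scheme; qe is a hypothetical quality-estimation
+model with spans() and confidence() methods, and the threshold is an assumed
+knob for the budget-saving filter.
+ <pre><code>
+def prefill_annotations(segments, qe, conf_threshold=0.9):
+    """Pre-fill ESA annotations with QE error spans and skip segments the
+    QE model is confident are correct (the annotation-budget saving)."""
+    queue = []
+    for seg in segments:
+        if qe.confidence(seg) >= conf_threshold:
+            continue  # deemed very likely correct, skip the human pass
+        queue.append({"segment": seg, "prefilled": qe.spans(seg)})
+    return queue
+ </code></pre>
+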
+
+
+
+
+ + ♻ ☆ Can Many-Shot In-Context Learning Help LLMs as Evaluators? A Preliminary + Empirical Study + + +
+ Utilizing Large Language Models (LLMs) as evaluators for evaluating the +performance of LLMs has recently garnered attention. However, this kind of +evaluation approach is affected by potential biases in LLMs, raising concerns +about the accuracy and reliability of the evaluation results. To mitigate this +issue, we propose and study two many-shot ICL prompts, which rely on two +versions of many-shot ICL prompt templates for helping LLM evaluators to +mitigate the potential biases in LLMs: Many-Shot with Reference (MSwR) and +Many-Shot without Reference (MSoR). Concretely, the former utilizes in-context +examples with model-generated rationales as guidance, and the latter without. +Based on the designed prompts, we investigate the impact of scaling the number +of in-context examples on the consistency and quality of the evaluation +results. Experimental results show that advanced LLMs, such as GPT-4o, perform +better in the many-shot regime than in the zero-shot regime. Furthermore, we +reveal the symbol bias hidden in the selection bias of LLMs and propose a +simple yet effective approach to mitigate the bias. Experimental results +further verify the effectiveness of the symbol bias mitigation approach. + +
+
+ comment: work in progress +
+
+
+
+
+ + ♻ ☆ A Review of Prominent Paradigms for LLM-Based Agents: Tool Use + (Including RAG), Planning, and Feedback Learning + + +
+ Tool use, planning, and feedback learning are currently three prominent +paradigms for developing Large Language Model (LLM)-based agents across various +tasks. Although numerous frameworks have been devised for each paradigm, their +intricate workflows and inconsistent taxonomy create challenges in +understanding and reviewing the frameworks across different paradigms. This +survey introduces a unified taxonomy to systematically review and discuss these +frameworks. Specifically, 1) the taxonomy defines environments/tasks, common +LLM-profiled roles (LMPRs: policy models, evaluators, and dynamic models), and +universally applicable workflows found in prior work, and 2) it enables a +comparison of key perspectives on LMPR implementations and workflow usage +across different agent paradigms. + +
+
+ comment: Under Review +
+
+
+
+
+ + ♻ ☆ Diversifying the Expert Knowledge for Task-Agnostic Pruning in Sparse + Mixture-of-Experts + + +
+ By increasing model parameters but activating them sparsely when performing a +task, the use of Mixture-of-Experts (MoE) architecture significantly improves +the performance of Large Language Models (LLMs) without increasing the +inference cost. However, the memory consumption due to the growing number of +experts presents a challenge to the deployment of these models in many +real-world settings. Our empirical study reveals that some experts encode +redundant knowledge during pre-training. We thus propose a method of grouping +and pruning similar experts to improve the model's parameter efficiency. We +validate the effectiveness of our method by pruning three state-of-the-art MoE +architectures, including Mixtral, Deepseek-MoE, and Qwen. The evaluation shows +that our method outperforms other model pruning methods on a range of natural +language tasks. We will release our code to facilitate future research. + +
+
+ comment: 13pages, 6 figures +
+
+
+
+
+ + ♻ ☆ The Invalsi Benchmarks: measuring Linguistic and Mathematical + understanding of Large Language Models in Italian + + +
+ While Italian is a high-resource language, there are few Italian-native +benchmarks to evaluate generative Large Language Models (LLMs) in this +language. This work presents three new benchmarks: Invalsi MATE to evaluate +model performance on mathematical understanding in Italian, Invalsi ITA to +evaluate language understanding in Italian and Olimpiadi MATE for more complex +mathematical understanding. + The first two benchmarks are based on the Invalsi tests, which are +administered to students aged between 6 and 18 within the Italian school system +and have been validated by several experts in teaching and pedagogy; the third +one comes from the Italian high school math Olympics. + We evaluate 10 powerful language models on these benchmarks and find that +they are bound by 71% accuracy on Invalsi MATE, achieved by Llama 3.1 70b +instruct, and by 88% on Invalsi ITA. For both Invalsi MATE and Invalsi ITA we +compare LLMs with the average performance of Italian students to show that +Llama 3.1 is the only one to outperform them on Invalsi MATE, while most models +do so on Invalsi ITA. We then show that Olimpiadi MATE is more challenging than +Invalsi MATE, with the highest accuracy, achieved by Llama 3.1 405b instruct, +being 45%. + We will make data and evaluation code openly available upon acceptance of the +paper. + +
+
+
+
+
+ + ♻ ☆ Schrodinger's Memory: Large Language Models + + +
+ Memory is the foundation of all human activities; without memory, it would be +nearly impossible for people to perform any task in daily life. With the +development of Large Language Models (LLMs), their language capabilities are +becoming increasingly comparable to those of humans. But do LLMs have memory? +Based on current performance, LLMs do appear to exhibit memory. So, what is the +underlying mechanism of this memory? Previous research has lacked a deep +exploration of LLMs' memory capabilities and the underlying theory. In this +paper, we use the Universal Approximation Theorem (UAT) to explain the memory +mechanism in LLMs. We also conduct experiments to verify the memory +capabilities of various LLMs, proposing a new method to assess these abilities. +We argue that LLM memory operates like Schrödinger's memory, meaning that it +only becomes observable when a specific memory is queried. We can only +determine if the model retains a memory based on its output in response to the +query; otherwise, it remains indeterminate. Finally, we expand on this concept +by comparing the memory capabilities of the human brain and LLMs, highlighting +the similarities and differences in their operational mechanisms. + +
+
+
+
+
+ + ♻ ☆ Unveiling Entity-Level Unlearning for Large Language Models: A + Comprehensive Analysis + + +
+ Large language model unlearning has garnered increasing attention due to its +potential to address security and privacy concerns, leading to extensive +research in the field. However, much of this research has concentrated on +instance-level unlearning, specifically targeting the removal of predefined +instances containing sensitive content. This focus has left a significant gap +in the exploration of full entity-level unlearning, which is critical in +real-world scenarios such as copyright protection. To this end, we propose a +novel task of Entity-level unlearning, which aims to erase entity-related +knowledge from the target model completely. To thoroughly investigate this +task, we systematically evaluate trending unlearning algorithms, revealing that +current methods struggle to achieve effective entity-level unlearning. Then, we +further explore the factors that influence the performance of the unlearning +algorithms, identifying that knowledge coverage and the size of the forget set +play pivotal roles. Notably, our analysis also uncovers that entities +introduced through fine-tuning are more vulnerable to unlearning than +pre-trained entities. These findings collectively offer valuable insights for +advancing entity-level unlearning for LLMs. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Elsevier Arena: Human Evaluation of Chemistry/Biology/Health + Foundational Large Language Models + + +
+ arXiv admin comment: This version has been removed by arXiv administrators as +the submitter did not have the rights to agree to the license at the time of +submission + +
+
+ comment: This document was submitted without obtaining all necessary + permissions and therefore needs to be withdrawn. The corresponding author + apologizes for any inconvenience this might cause +
+
+
+
+
+ + ♻ ☆ Generating Synthetic Free-text Medical Records with Low + Re-identification Risk using Masked Language Modeling + + +
+ In this paper, we present a system that generates synthetic free-text medical +records, such as discharge summaries, admission notes and doctor +correspondences, using Masked Language Modeling (MLM). Our system is designed +to preserve the critical information of the records while introducing +significant diversity and minimizing re-identification risk. The system +incorporates a de-identification component that uses Philter to mask Protected +Health Information (PHI), followed by a medical Named Entity Recognition (NER) +model to retain key medical information. We explore various masking ratios and +mask-filling techniques to balance the trade-off between diversity and fidelity +in the synthetic outputs without affecting overall readability. Our results +demonstrate that the system can produce high-quality synthetic data with +significant diversity while achieving a HIPAA-compliant PHI recall rate of 0.96 +and a low re-identification risk of 0.035. Furthermore, downstream evaluations +using a NER task reveal that the synthetic data can be effectively used to +train models with performance comparable to those trained on real data. The +flexibility of the system allows it to be adapted for specific use cases, +making it a valuable tool for privacy-preserving data generation in medical +research and healthcare applications. + +
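+
+ The mask-filling step can be illustrated with a generic Hugging Face MLM;
+the paper's system additionally runs Philter de-identification and a medical
+NER filter, which are not reproduced here, and the note text is invented.
+ <pre><code>
+from transformers import pipeline
+
+fill = pipeline("fill-mask", model="bert-base-uncased")
+
+note = "Patient was admitted with [MASK] pain and discharged on day two."
+for cand in fill(note, top_k=3):
+    # each candidate carries a filled-in sequence and its probability
+    print(round(cand["score"], 3), cand["sequence"])
+ </code></pre>
+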
+
+ comment: Added references and rephrased some sentences +
+
+
+
+
+ + ♻ ☆ Large language models can replicate cross-cultural differences in + personality + + +
+ We use a large-scale experiment (N=8000) to determine whether GPT-4 can +replicate cross-cultural differences in the Big Five, measured using the +Ten-Item Personality Inventory. We used the US and South Korea as the cultural +pair, given that prior research suggests substantial personality differences +between people from these two countries. We manipulated the target of the +simulation (US vs. Korean), the language of the inventory (English vs. Korean), +and the language model (GPT-4 vs. GPT-3.5). Our results show that GPT-4 +replicated the cross-cultural differences for each factor. However, mean +ratings had an upward bias and exhibited lower variation than in the human +samples, as well as lower structural validity. We provide preliminary evidence +that LLMs can aid cross-cultural researchers and practitioners. + +
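+
+ For reference, the standard TIPI scoring key (Gosling et al., 2003) pairs one
+regular and one reverse-scored item per trait; the sketch below follows that
+key and should be checked against the original instrument.
+ <pre><code>
+def tipi_scores(r):
+    """Score the Ten-Item Personality Inventory; r maps item numbers 1-10
+    to responses on a 1-7 scale. The second item of each pair is
+    reverse-scored (assumed standard key)."""
+    rev = lambda x: 8 - x
+    return {
+        "extraversion":        (r[1] + rev(r[6])) / 2,
+        "agreeableness":       (rev(r[2]) + r[7]) / 2,
+        "conscientiousness":   (r[3] + rev(r[8])) / 2,
+        "emotional_stability": (rev(r[4]) + r[9]) / 2,
+        "openness":            (r[5] + rev(r[10])) / 2,
+    }
+ </code></pre>
+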
+
+ comment: 27 pages: 12 pages of manuscript + 15 pages of supplementary + materials +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 30 + +
+
+
+ + ☆ Self-Contrastive Forward-Forward Algorithm + + +
+ The Forward-Forward (FF) algorithm is a recent, purely forward-mode learning +method that updates weights locally and layer-wise and supports supervised as +well as unsupervised learning. These features make it ideal for applications +such as brain-inspired learning, low-power hardware neural networks, and +distributed learning in large models. However, while FF has shown promise on +handwritten digit recognition tasks, its performance on natural images and +time-series remains a challenge. A key limitation is the need to generate +high-quality negative examples for contrastive learning, especially in +unsupervised tasks, where versatile solutions are currently lacking. To address +this, we introduce the Self-Contrastive Forward-Forward (SCFF) method, inspired +by self-supervised contrastive learning. SCFF generates positive and negative +examples applicable across different datasets, surpassing existing local +forward algorithms for unsupervised classification accuracy on MNIST (MLP: +98.7%), CIFAR-10 (CNN: 80.75%), and STL-10 (CNN: 77.3%). Additionally, SCFF is +the first to enable FF training of recurrent neural networks, opening the door +to more complex tasks and continuous-time video and text processing. + +
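+
+ For context, each Forward-Forward layer is trained on a local "goodness"
+objective; the sketch below shows that standard objective only. SCFF's
+contribution is how the positive/negative inputs are constructed, which is not
+reproduced here.
+ <pre><code>
+import torch
+import torch.nn.functional as F
+
+def ff_layer_loss(layer, x_pos, x_neg, theta=2.0):
+    """Local FF objective for one layer: push the goodness (sum of squared
+    activations) above theta for positive inputs and below it for negatives."""
+    g_pos = layer(x_pos).pow(2).sum(dim=1)
+    g_neg = layer(x_neg).pow(2).sum(dim=1)
+    logits = torch.cat([g_pos - theta, theta - g_neg])
+    return F.softplus(-logits).mean()  # mean log(1 + exp(-logit))
+ </code></pre>
+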
+
+
+
+
+ + ☆ Preference Tuning with Human Feedback on Language, Speech, and Vision + Tasks: A Survey + + +
+ Preference tuning is a crucial process for aligning deep generative models +with human preferences. This survey offers a thorough overview of recent +advancements in preference tuning and the integration of human feedback. The +paper is organized into three main sections: 1) introduction and preliminaries: +an introduction to reinforcement learning frameworks, preference tuning tasks, +models, and datasets across various modalities: language, speech, and vision, +as well as different policy approaches, 2) in-depth examination of each +preference tuning approach: a detailed analysis of the methods used in +preference tuning, and 3) applications, discussion, and future directions: an +exploration of the applications of preference tuning in downstream tasks, +including evaluation methods for different modalities, and an outlook on future +research directions. Our objective is to present the latest methodologies in +preference tuning and model alignment, enhancing the understanding of this +field for researchers and practitioners. We hope to encourage further +engagement and innovation in this area. + +
+
+ comment: Survey paper +
+
+
+
+
+ + ☆ Open-Set Semantic Uncertainty Aware Metric-Semantic Graph Matching + + +
+ Underwater object-level mapping requires incorporating visual foundation +models to handle the uncommon and often previously unseen object classes +encountered in marine scenarios. In this work, a metric of semantic uncertainty +for open-set object detections produced by visual foundation models is +calculated and then incorporated into an object-level uncertainty tracking +framework. Object-level uncertainties and geometric relationships between +objects are used to enable robust object-level loop closure detection for +unknown object classes. The above loop closure detection problem is formulated +as a graph-matching problem. While graph matching, in general, is NP-Complete, +a solver for an equivalent formulation of the proposed graph matching problem +as a graph editing problem is tested on multiple challenging underwater scenes. +Results for this solver as well as three other solvers demonstrate that the +proposed methods are feasible for real-time use in marine environments for +robust, open-set, multi-object, semantic-uncertainty-aware loop closure +detection. Further experimental results on the KITTI dataset demonstrate that +the method generalizes to large-scale terrestrial scenes. + +
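+
+ The graph-editing formulation can be prototyped with networkx's graph edit
+distance; the toy object graphs and class labels below are assumptions, and in
+the paper's setting the substitution cost could be driven by semantic
+uncertainty rather than exact label equality.
+ <pre><code>
+import networkx as nx
+
+# Two small object graphs: nodes are detected objects, labels are classes.
+g1 = nx.Graph()
+g1.add_nodes_from([(0, {"cls": "anchor"}), (1, {"cls": "rope"})])
+g1.add_edge(0, 1)
+
+g2 = nx.Graph()
+g2.add_nodes_from([(0, {"cls": "anchor"}), (1, {"cls": "chain"})])
+g2.add_edge(0, 1)
+
+cost = nx.graph_edit_distance(
+    g1, g2,
+    node_subst_cost=lambda a, b: 0.0 if a["cls"] == b["cls"] else 1.0,
+)
+print(cost)  # low cost suggests a loop-closure candidate
+ </code></pre>
+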
+
+
+
+
+ + ☆ Multi-Domain Data Aggregation for Axon and Myelin Segmentation in + Histology Images + + +
+ Quantifying axon and myelin properties (e.g., axon diameter, myelin +thickness, g-ratio) in histology images can provide useful information about +microstructural changes caused by neurodegenerative diseases. Automatic tissue +segmentation is an important tool for these datasets, as a single stained +section can contain up to thousands of axons. Advances in deep learning have +made this task quick and reliable with minimal overhead, but a deep learning +model trained by one research group will hardly ever be usable by other groups +due to differences in their histology training data. This is partly due to +subject diversity (different body parts, species, genetics, pathologies) and +also to the range of modern microscopy imaging techniques resulting in a wide +variability of image features (i.e., contrast, resolution). There is a pressing +need to make AI accessible to neuroscience researchers to facilitate and +accelerate their workflow, but publicly available models are scarce and poorly +maintained. Our approach is to aggregate data from multiple imaging modalities +(bright field, electron microscopy, Raman spectroscopy) and species (mouse, +rat, rabbit, human), to create an open-source, durable tool for axon and myelin +segmentation. Our generalist model makes it easier for researchers to process +their data and can be fine-tuned for better performance on specific domains. We +study the benefits of different aggregation schemes. This multi-domain +segmentation model performs better than single-modality dedicated learners +(p=0.03077), generalizes better on out-of-distribution data and is easier to +use and maintain. Importantly, we package the segmentation tool into a +well-maintained open-source software ecosystem (see +https://github.com/axondeepseg/axondeepseg). + +
+
+ comment: 12 pages, 8 figures +
+
+
+
+
+ + ☆ NCT-CRC-HE: Not All Histopathological Datasets Are Equally Useful + + +
+ Numerous deep learning-based solutions have been proposed for +histopathological image analysis over the past years. While they usually +demonstrate exceptionally high accuracy, one key question is whether their +precision might be affected by low-level image properties not related to +histopathology but caused by microscopy image handling and pre-processing. In +this paper, we analyze a popular NCT-CRC-HE-100K colorectal cancer dataset used +in numerous prior works and show that both this dataset and the obtained +results may be affected by data-specific biases. The most prominent revealed +dataset issues are inappropriate color normalization, severe JPEG artifacts +inconsistent between different classes, and completely corrupted tissue samples +resulting from incorrect image dynamic range handling. We show that even the +simplest model using only 3 features per image (red, green and blue color +intensities) can demonstrate over 50% accuracy on this 9-class dataset, while +using a color histogram that does not explicitly capture cell morphology +features yields over 82% accuracy. Moreover, we show that a basic +EfficientNet-B0 ImageNet pretrained model can achieve over 97.7% accuracy on +this dataset, outperforming all previously proposed solutions developed for +this task, including dedicated foundation histopathological models and large +cell morphology-aware neural networks. The NCT-CRC-HE dataset is publicly +available and can be freely used to replicate the presented results. The codes +and pre-trained models used in this paper are available at +https://github.com/gmalivenko/NCT-CRC-HE-experiments + +
+
+
+
+
+ + ☆ VALO: A Versatile Anytime Framework for LiDAR-based Object Detection + Deep Neural Networks + + +
+ This work addresses the challenge of adapting to dynamic deadline +requirements for LiDAR object detection deep neural networks (DNNs). The +computing latency of object detection is critically important to ensure safe +and efficient navigation. However, state-of-the-art LiDAR object detection DNNs +often exhibit significant latency, hindering their real-time performance on +resource-constrained edge platforms. Therefore, a tradeoff between detection +accuracy and latency should be dynamically managed at runtime to achieve +optimum results. + In this paper, we introduce VALO (Versatile Anytime algorithm for LiDAR +Object detection), a novel data-centric approach that enables anytime computing +of 3D LiDAR object detection DNNs. VALO employs a deadline-aware scheduler to +selectively process input regions, making execution time and accuracy tradeoffs +without architectural modifications. Additionally, it leverages efficient +forecasting of past detection results to mitigate possible loss of accuracy due +to partial processing of input. Finally, it utilizes a novel input reduction +technique within its detection heads to significantly accelerate execution +without sacrificing accuracy. + We implement VALO on state-of-the-art 3D LiDAR object detection networks, +namely CenterPoint and VoxelNext, and demonstrate its dynamic adaptability to a +wide range of time constraints while achieving higher accuracy than the prior +state-of-the-art. Code is available at https://github.com/CSL-KU/VALO. + +
+
+
+
+
+ + ☆ Obfuscation Based Privacy Preserving Representations are Recoverable + Using Neighborhood Information + + +
+ Rapid growth in the popularity of AR/VR/MR applications and cloud-based +visual localization systems has given rise to an increased focus on the privacy +of user content in the localization process. + This privacy concern has been further escalated by the ability of deep neural +networks to recover detailed images of a scene from a sparse set of 3D or 2D +points and their descriptors - the so-called inversion attacks. + Research on privacy-preserving localization has therefore focused on +preventing these inversion attacks on both the query image keypoints and the 3D +points of the scene map. + To this end, several geometry obfuscation techniques that lift points to +higher-dimensional spaces, i.e., lines or planes, or that swap coordinates +between points have been proposed. + In this paper, we point to a common weakness of these obfuscations that +allows recovering approximations of the original point positions under the +assumption of known neighborhoods. + We further show that these neighborhoods can be computed by learning to +identify descriptors that co-occur in neighborhoods. + Extensive experiments show that our approach for point recovery is +practically applicable to all existing geometric obfuscation schemes. + Our results show that these schemes should not be considered +privacy-preserving, even though they are claimed to be. + Code will be available at +https://github.com/kunalchelani/RecoverPointsNeighborhood. + +
+
+
+
+
+ + ☆ Unsupervised Hybrid framework for ANomaly Detection (HAND) -- applied to + Screening Mammogram + + +
+ Out-of-distribution (OOD) detection is crucial for enhancing the +generalization of AI models used in mammogram screening. Given the challenge of +limited prior knowledge about OOD samples in external datasets, unsupervised +generative learning is a preferable solution which trains the model to discern +the normal characteristics of in-distribution (ID) data. The hypothesis is that +during inference, the model aims to reconstruct ID samples accurately, while +OOD samples exhibit poorer reconstruction due to their divergence from +normality. Inspired by state-of-the-art (SOTA) hybrid architectures combining +CNNs and transformers, we developed a novel backbone - HAND - for detecting OOD +from large-scale digital screening mammogram studies. To boost the learning +efficiency, we incorporated synthetic OOD samples and a parallel discriminator +in the latent space to distinguish between ID and OOD samples. Gradient +reversal to the OOD reconstruction loss penalizes the model for learning OOD +reconstructions. An anomaly score is computed by weighting the reconstruction +and discriminator loss. On an internal RSNA mammogram held-out test set and an +external Mayo Clinic hand-curated dataset, the proposed HAND model outperformed +encoder-based and GAN-based baselines, and interestingly, it also outperformed +the hybrid CNN+transformer baselines. Therefore, the proposed HAND pipeline +offers an automated, efficient computational solution for domain-specific +quality checks in external screening mammograms, yielding actionable insights +without direct exposure to the private medical imaging data. + +
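+
+ The scoring rule reduces to a weighted sum; alpha below is an assumed tunable
+weight, as the paper's exact weighting scheme is not given in the abstract.
+ <pre><code>
+def anomaly_score(recon_loss, disc_loss, alpha=0.5):
+    """Weighted anomaly score combining reconstruction error with the
+    latent-space discriminator loss (alpha is a hypothetical knob)."""
+    return alpha * recon_loss + (1.0 - alpha) * disc_loss
+ </code></pre>
+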
+
+
+
+
+ + ☆ Robot Manipulation in Salient Vision through Referring Image + Segmentation and Geometric Constraints + + +
+ In this paper, we perform robot manipulation activities in real-world +environments with language contexts by integrating a compact referring image +segmentation model into the robot's perception module. First, we propose +CLIPU^2Net, a lightweight referring image segmentation model designed for +fine-grain boundary and structure segmentation from language expressions. Then, +we deploy the model in an eye-in-hand visual servoing system to enact robot +control in the real world. The key to our system is the representation of +salient visual information as geometric constraints, linking the robot's visual +perception to actionable commands. Experimental results on 46 real-world robot +manipulation tasks demonstrate that our method outperforms traditional visual +servoing methods relying on labor-intensive feature annotations, excels in +fine-grain referring image segmentation with a compact decoder size of 6.6 MB, +and supports robot control across diverse contexts. + +
+
+
+
+
+ + ☆ Mamba Fusion: Learning Actions Through Questioning + + +
+ Video Language Models (VLMs) are crucial for generalizing across diverse +tasks and using language cues to enhance learning. While transformer-based +architectures have been the de facto in vision-language training, they face +challenges like quadratic computational complexity, high GPU memory usage, and +difficulty with long-term dependencies. To address these limitations, we +introduce MambaVL, a novel model that leverages recent advancements in +selective state space modality fusion to efficiently capture long-range +dependencies and learn joint representations for vision and language data. +MambaVL utilizes a shared state transition matrix across both modalities, +allowing the model to capture information about actions from multiple +perspectives within the scene. Furthermore, we propose a question-answering +task that helps guide the model toward relevant cues. These questions provide +critical information about actions, objects, and environmental context, leading +to enhanced performance. As a result, MambaVL achieves state-of-the-art +performance in action recognition on the Epic-Kitchens-100 dataset and +outperforms baseline methods in action anticipation. + +
+
+
+
+
+ + ☆ Good Grasps Only: A data engine for self-supervised fine-tuning of pose + estimation using grasp poses for verification + + +
+ In this paper, we present a novel method for self-supervised fine-tuning of +pose estimation for bin-picking. Leveraging zero-shot pose estimation, our +approach enables the robot to automatically obtain training data without manual +labeling. After pose estimation, the object is grasped, and in-hand pose +estimation is used for data validation. Our pipeline allows the system to +fine-tune while the process is running, removing the need for a learning phase. + The motivation behind our work lies in the need for rapid setup of pose +estimation solutions. Specifically, we address the challenging task of bin +picking, which plays a pivotal role in flexible robotic setups. + Our method is implemented in a robotic work cell and tested with four +different objects. For all objects, our method increases the performance and +outperforms a state-of-the-art method trained on the CAD models of the objects. + +
+
+ comment: 8 pages, 7 figures, 3 tables +
+
+
+
+
+ + ☆ Retinal Vessel Segmentation with Deep Graph and Capsule Reasoning + + +
+ Effective retinal vessel segmentation requires a sophisticated integration of +global contextual awareness and local vessel continuity. To address this +challenge, we propose the Graph Capsule Convolution Network (GCC-UNet), which +merges capsule convolutions with CNNs to capture both local and global +features. The Graph Capsule Convolution operator is specifically designed to +enhance the representation of global context, while the Selective Graph +Attention Fusion module ensures seamless integration of local and global +information. To further improve vessel continuity, we introduce the Bottleneck +Graph Attention module, which incorporates Channel-wise and Spatial Graph +Attention mechanisms. The Multi-Scale Graph Fusion module adeptly combines +features from various scales. Our approach has been rigorously validated +through experiments on widely used public datasets, with ablation studies +confirming the efficacy of each component. Comparative results highlight +GCC-UNet's superior performance over existing methods, setting a new benchmark +in retinal vessel segmentation. Notably, this work represents the first +integration of vanilla, graph, and capsule convolutional techniques in the +domain of medical image segmentation. + +
+
+
+
+
+ + ☆ Phidias: A Generative Model for Creating 3D Content from Text, Image, + and 3D Conditions with Reference-Augmented Diffusion + + +
+ In 3D modeling, designers often use an existing 3D model as a reference to +create new ones. This practice has inspired the development of Phidias, a novel +generative model that uses diffusion for reference-augmented 3D generation. +Given an image, our method leverages a retrieved or user-provided 3D reference +model to guide the generation process, thereby enhancing the generation +quality, generalization ability, and controllability. Our model integrates +three key components: 1) meta-ControlNet that dynamically modulates the +conditioning strength, 2) dynamic reference routing that mitigates misalignment +between the input image and 3D reference, and 3) self-reference augmentations +that enable self-supervised training with a progressive curriculum. +Collectively, these designs result in a clear improvement over existing +methods. Phidias establishes a unified framework for 3D generation using text, +image, and 3D conditions with versatile applications. + +
+
+ comment: Project page: https://RAG-3D.github.io/ +
+
+
+
+
+ + ☆ NVLM: Open Frontier-Class Multimodal LLMs + + +
+ We introduce NVLM 1.0, a family of frontier-class multimodal large language +models (LLMs) that achieve state-of-the-art results on vision-language tasks, +rivaling the leading proprietary models (e.g., GPT-4o) and open-access models +(e.g., Llama 3-V 405B and InternVL 2). Remarkably, NVLM 1.0 shows improved +text-only performance over its LLM backbone after multimodal training. In terms +of model design, we perform a comprehensive comparison between decoder-only +multimodal LLMs (e.g., LLaVA) and cross-attention-based models (e.g., +Flamingo). Based on the strengths and weaknesses of both approaches, we propose +a novel architecture that enhances both training efficiency and multimodal +reasoning capabilities. Furthermore, we introduce a 1-D tile-tagging design for +tile-based dynamic high-resolution images, which significantly boosts +performance on multimodal reasoning and OCR-related tasks. Regarding training +data, we meticulously curate and provide detailed information on our multimodal +pretraining and supervised fine-tuning datasets. Our findings indicate that +dataset quality and task diversity are more important than scale, even during +the pretraining phase, across all architectures. Notably, we develop +production-grade multimodality for the NVLM-1.0 models, enabling them to excel +in vision-language tasks while maintaining and even improving text-only +performance compared to their LLM backbones. To achieve this, we craft and +integrate a high-quality text-only dataset into multimodal training, alongside +a substantial amount of multimodal math and reasoning data, leading to enhanced +math and coding capabilities across modalities. To advance research in the +field, we are releasing the model weights and will open-source the code for the +community: https://nvlm-project.github.io/. + +
+
+
+
+
+ + ☆ Two Stage Segmentation of Cervical Tumors using PocketNet + + +
+ Cervical cancer remains the fourth most common malignancy amongst women
+worldwide [1]. Concurrent chemoradiotherapy (CRT) serves as the mainstay
+definitive treatment regimen for locally advanced cervical cancers and includes
+external beam radiation followed by brachytherapy [2]. Integral to radiotherapy
+treatment planning is the routine contouring of the target tumor at the level
+of the cervix, the associated gynecologic anatomy, and the adjacent organs at
+risk (OARs). However, manual contouring of these structures is both time- and
+labor-intensive and associated with known interobserver variability that can
+impact treatment outcomes. While multiple tools have been developed to
+automatically segment OARs and the high-risk clinical tumor volume (HR-CTV)
+using computed tomography (CT) images [3-6], the development of deep
+learning-based tumor segmentation tools using routine T2-weighted (T2w)
+magnetic resonance imaging (MRI) addresses an unmet clinical need to improve
+the routine contouring of both anatomical structures and cervical cancers,
+thereby increasing the quality and consistency of radiotherapy planning. This
+work applied a novel deep-learning model (PocketNet) to segment the cervix,
+vagina, uterus, and tumor(s) on T2w MRI. The performance of the PocketNet
+architecture was evaluated when trained via 5-fold cross-validation. PocketNet
+achieved a mean Dice-Sorensen similarity coefficient (DSC) exceeding 70% for
+tumor segmentation and 80% for organ segmentation. These results suggest that
+PocketNet is robust to variations in contrast protocols, providing reliable
+segmentation of the ROIs.
+
+
+
+
+
+ + ☆ Training Datasets Generation for Machine Learning: Application to Vision + Based Navigation SP + + +
+ Vision Based Navigation consists in utilizing cameras as precision sensors
+for GNC after extracting information from images. To enable the adoption of
+machine learning for space applications, one of the obstacles is demonstrating
+that available training datasets are adequate to validate the algorithms. The
+objective of the study is to generate datasets of images and metadata suitable
+for training machine learning algorithms. Two use cases were selected, and a
+robust methodology was developed to validate the datasets, including the ground
+truth. The first use case is in-orbit rendezvous with a man-made object: a
+mockup of the ENVISAT satellite. The second use case is a Lunar landing
+scenario. Datasets were produced from archival datasets (Chang'e 3), from the
+laboratory at the DLR TRON facility and at the Airbus Robotic laboratory, from
+the SurRender high-fidelity image simulation software using Model Capture, and
+from Generative Adversarial Networks. The use case definition included the
+selection of algorithms as benchmarks: an AI-based pose estimation algorithm
+and a dense optical flow algorithm were selected. Ultimately, it is
+demonstrated that datasets produced with SurRender and the selected laboratory
+facilities are adequate to train machine learning algorithms.
+
+
+ comment: 6 pages, 4 figures, preprint of the proceedings of ESA SPAICE + conference 2024 +
+
+
+
+
+ + ☆ Ultrasound Image Enhancement with the Variance of Diffusion Models + + +
+ Ultrasound imaging, despite its widespread use in medicine, often suffers +from various sources of noise and artifacts that impact the signal-to-noise +ratio and overall image quality. Enhancing ultrasound images requires a +delicate balance between contrast, resolution, and speckle preservation. This +paper introduces a novel approach that integrates adaptive beamforming with +denoising diffusion-based variance imaging to address this challenge. By +applying Eigenspace-Based Minimum Variance (EBMV) beamforming and employing a +denoising diffusion model fine-tuned on ultrasound data, our method computes +the variance across multiple diffusion-denoised samples to produce high-quality +despeckled images. This approach leverages both the inherent multiplicative +noise of ultrasound and the stochastic nature of diffusion models. Experimental +results on a publicly available dataset demonstrate the effectiveness of our +method in achieving superior image reconstructions from single plane-wave +acquisitions. The code is available at: +https://github.com/Yuxin-Zhang-Jasmine/IUS2024_Diffusion. + +
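+ A minimal sketch of the variance-imaging step described above, assuming a
+stochastic denoise(image, seed) callable standing in for the fine-tuned
+diffusion sampler (this is not the authors' released pipeline):
+
+import numpy as np
+
+def variance_image(beamformed, denoise, n_samples=8):
+    # Each diffusion run is stochastic; the pixelwise variance across runs
+    # exploits the multiplicative speckle noise to produce a despeckled map.
+    samples = np.stack([denoise(beamformed, seed=s) for s in range(n_samples)])
+    return samples.var(axis=0)
+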
+
+ comment: Accepted by the IEEE International Ultrasonics Symposium (IUS) 2024 +
+
+
+
+
+ + ☆ Multi-OCT-SelfNet: Integrating Self-Supervised Learning with + Multi-Source Data Fusion for Enhanced Multi-Class Retinal Disease + Classification + + +
+ In the medical domain, acquiring large datasets poses significant challenges
+due to privacy concerns. Nonetheless, the development of a robust deep-learning
+model for retinal disease diagnosis necessitates a substantial dataset for
+training. The capacity to generalize effectively on smaller datasets remains a
+persistent challenge, and the scarcity of data presents a significant barrier
+to the practical implementation of scalable medical AI solutions. To address
+this issue, we combined a wide range of data sources to improve performance and
+generalization to new data, and developed a self-supervised framework based on
+the SwinV2 transformer to gain a deeper understanding of multi-modal dataset
+representations, enhancing the model's ability to extrapolate to new data for
+the detection of eye diseases using optical coherence tomography (OCT) images.
+We adopt a two-phase training methodology: self-supervised pre-training and
+fine-tuning on a downstream supervised classifier. An ablation study conducted
+across three datasets, employing various encoder backbones, without data
+fusion, in a low data availability setting, and without self-supervised
+pre-training, highlights the robustness of our method. Our findings demonstrate
+consistent performance across these diverse conditions, showcasing superior
+generalization capabilities compared to the baseline model, ResNet-50.
+
+
+ comment: 25 pages, 9 tables, 10 figures +
+
+
+
+
+ + ☆ Uncertainty and Prediction Quality Estimation for Semantic Segmentation + via Graph Neural Networks BMVC + + +
+ When employing deep neural networks (DNNs) for semantic segmentation in
+safety-critical applications like automotive perception or medical imaging, it
+is important to estimate their performance at runtime, e.g. via uncertainty
+estimates or prediction quality estimates. Previous works mostly performed
+uncertainty estimation at the pixel level. In a line of research, a
+connected-component-wise (segment-wise) perspective was taken, approaching
+uncertainty estimation at the object level by performing so-called meta
+classification and regression to estimate uncertainty and prediction quality,
+respectively. In those works, each predicted segment is considered individually
+to estimate its uncertainty or prediction quality. However, the neighboring
+segments may provide additional hints on whether a given predicted segment is
+of high quality, which we study in the present work. On the basis of
+uncertainty-indicating metrics on the segment level, we use graph neural
+networks (GNNs) to model the quality of a given segment as a function of
+the given segment's metrics as well as those of its neighboring segments. We
+compare different GNN architectures and achieve a notable performance
+improvement.
+
+
+ comment: 11 pages, 3 figures, submitted to BMVC "Workshop on Robust + Recognition in the Open World" (https://rrow2024.github.io/call-for-papers) +
+
+
+
+
+ + ☆ Compact Implicit Neural Representations for Plane Wave Images + + +
+ Ultrafast Plane-Wave (PW) imaging often produces artifacts and shadows that +vary with insonification angles. We propose a novel approach using Implicit +Neural Representations (INRs) to compactly encode multi-planar sequences while +preserving crucial orientation-dependent information. To our knowledge, this is +the first application of INRs for PW angular interpolation. Our method employs +a Multi-Layer Perceptron (MLP)-based model with a concise physics-enhanced +rendering technique. Quantitative evaluations using SSIM, PSNR, and standard +ultrasound metrics, along with qualitative visual assessments, confirm the +effectiveness of our approach. Additionally, our method demonstrates +significant storage efficiency, with model weights requiring 530 KB compared to +8 MB for directly storing the 75 PW images, achieving a notable compression +ratio of approximately 15:1. + +
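+ As an illustration of the INR idea, a minimal coordinate-MLP that maps
+(lateral position, depth, steering angle) to pixel intensity; the layer sizes
+are assumptions, not the paper's architecture:
+
+import torch
+import torch.nn as nn
+
+class PlaneWaveINR(nn.Module):
+    def __init__(self, hidden=128):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(3, hidden), nn.ReLU(),
+            nn.Linear(hidden, hidden), nn.ReLU(),
+            nn.Linear(hidden, 1))
+
+    def forward(self, coords):  # coords: (N, 3) = (x, z, angle)
+        return self.net(coords)
+
+ Once trained, only the MLP weights are stored, which is how a ~530 KB model
+can stand in for ~8 MB of raw plane-wave frames (the ~15:1 ratio above).
+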
+
+ comment: Accepted by the IEEE International Ultrasonics Symposium (IUS) 2024 +
+
+
+
+
+ + ♻ ☆ SEDMamba: Enhancing Selective State Space Modelling with Bottleneck + Mechanism and Fine-to-Coarse Temporal Fusion for Efficient Error Detection in + Robot-Assisted Surgery + + +
+ Automated detection of surgical errors can improve robotic-assisted surgery. +Despite promising progress, existing methods still face challenges in capturing +rich temporal context to establish long-term dependencies while maintaining +computational efficiency. In this paper, we propose a novel hierarchical model +named SEDMamba, which incorporates the selective state space model (SSM) into +surgical error detection, facilitating efficient long sequence modelling with +linear complexity. SEDMamba enhances selective SSM with a bottleneck mechanism +and fine-to-coarse temporal fusion (FCTF) to detect and temporally localize +surgical errors in long videos. The bottleneck mechanism compresses and +restores features within their spatial dimension, thereby reducing +computational complexity. FCTF utilizes multiple dilated 1D convolutional +layers to merge temporal information across diverse scale ranges, accommodating +errors of varying duration. Our work also contributes the first-of-its-kind, +frame-level, in-vivo surgical error dataset to support error detection in real +surgical cases. Specifically, we deploy the clinically validated observational +clinical human reliability assessment tool (OCHRA) to annotate the errors +during suturing tasks in an open-source radical prostatectomy dataset +(SAR-RARP50). Experimental results demonstrate that our SEDMamba outperforms +state-of-the-art methods with at least 1.82% AUC and 3.80% AP performance gains +with significantly reduced computational complexity. The corresponding error +annotations, code and models will be released at +https://github.com/wzjialang/SEDMamba. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ♻ ☆ How to Determine the Preferred Image Distribution of a Black-Box + Vision-Language Model? + + +
+ Large foundation models have revolutionized the field, yet challenges remain +in optimizing multi-modal models for specialized visual tasks. We propose a +novel, generalizable methodology to identify preferred image distributions for +black-box Vision-Language Models (VLMs) by measuring output consistency across +varied input prompts. Applying this to different rendering types of 3D objects, +we demonstrate its efficacy across various domains requiring precise +interpretation of complex structures, with a focus on Computer-Aided Design +(CAD) as an exemplar field. We further refine VLM outputs using in-context +learning with human feedback, significantly enhancing explanation quality. To +address the lack of benchmarks in specialized domains, we introduce CAD-VQA, a +new dataset for evaluating VLMs on CAD-related visual question answering tasks. +Our evaluation of state-of-the-art VLMs on CAD-VQA establishes baseline +performance levels, providing a framework for advancing VLM capabilities in +complex visual reasoning tasks across various fields requiring expert-level +visual interpretation. We release the dataset and evaluation codes at +\url{https://github.com/asgsaeid/cad_vqa}. + +
+
+
+
+
+ + ♻ ☆ Hyper-STTN: Social Group-aware Spatial-Temporal Transformer Network for + Human Trajectory Prediction with Hypergraph Reasoning + + +
+ Predicting crowd intents and trajectories is crucial in various real-world
+applications, including service robots and autonomous vehicles. Understanding
+environmental dynamics is challenging, not only due to the complexities of
+modeling pair-wise spatial and temporal interactions but also the diverse
+influence of group-wise interactions. To decode the comprehensive pair-wise and
+group-wise interactions in crowded scenarios, we introduce Hyper-STTN, a
+Hypergraph-based Spatial-Temporal Transformer Network for crowd trajectory
+prediction. In Hyper-STTN, crowded group-wise correlations are constructed
+using a set of multi-scale hypergraphs with varying group sizes, captured
+through random-walk probability-based hypergraph spectral convolution.
+Additionally, a spatial-temporal transformer is adapted to capture pedestrians'
+pair-wise latent interactions in the spatial-temporal dimensions. These
+heterogeneous group-wise and pair-wise interactions are then fused and aligned
+through a multimodal transformer network. Hyper-STTN outperforms other
+state-of-the-art baselines and ablation models on 5 real-world pedestrian
+motion datasets.
+
+
+
+
+
+ + ♻ ☆ MARS: Mask Attention Refinement with Sequential Quadtree Nodes for Car + Damage Instance Segmentation + + +
+ Evaluating car damage after an accident is critical to the car insurance
+industry. However, the accuracy of existing methods is still insufficient for
+real-world applications, since deep learning networks are not designed for car
+damage images as inputs, and their segmented masks are still very coarse. This
+paper presents MARS (Mask Attention Refinement with Sequential quadtree nodes)
+for car damage instance segmentation. MARS uses self-attention mechanisms
+to draw global dependencies between the sequential quadtree nodes layer and the
+quadtree transformer to recalibrate channel weights and predict highly accurate
+instance masks. Our extensive experiments demonstrate that MARS outperforms
+state-of-the-art (SOTA) instance segmentation methods such as Mask R-CNN [9],
+PointRend [13], and Mask Transfiner [12] by a large margin of +1.3 maskAP with
+the R50-FPN backbone and +2.3 maskAP with the R101-FPN backbone on the Thai
+car-damage dataset. Our demos are available at
+https://github.com/kaopanboonyuen/MARS.
+
+
+ comment: 14 pages. arXiv admin note: substantial text overlap with + arXiv:2111.13673 by other authors +
+
+
+
+
+ + ♻ ☆ Towards Secure and Usable 3D Assets: A Novel Framework for Automatic + Visible Watermarking WACV2025 + + +
+ 3D models, particularly AI-generated ones, have witnessed a recent surge +across various industries such as entertainment. Hence, there is an alarming +need to protect the intellectual property and avoid the misuse of these +valuable assets. As a viable solution to address these concerns, we rigorously +define the novel task of automated 3D visible watermarking in terms of two +competing aspects: watermark quality and asset utility. Moreover, we propose a +method of embedding visible watermarks that automatically determines the right +location, orientation, and number of watermarks to be placed on arbitrary 3D +assets for high watermark quality and asset utility. Our method is based on a +novel rigid-body optimization that uses back-propagation to automatically learn +transforms for ideal watermark placement. In addition, we propose a novel +curvature-matching method for fusing the watermark into the 3D model that +further improves readability and security. Finally, we provide a detailed +experimental analysis on two benchmark 3D datasets validating the superior +performance of our approach in comparison to baselines. Code and demo are +available. + +
+
+ comment: Accepted to WACV2025 +
+
+
+
+
+ + ♻ ☆ Enhancing Worldwide Image Geolocation by Ensembling Satellite-Based + Ground-Level Attribute Predictors + + +
+ We examine the challenge of estimating the location of a single ground-level
+image in the absence of GPS or other location metadata. Currently, geolocation
+systems are evaluated by measuring the Great Circle Distance between the
+predicted location and ground truth. Because this measurement only uses a
+single point, it cannot assess the distribution of predictions by geolocation
+systems. Evaluation of a distribution of potential locations (areas) is
+required when there are follow-on procedures to further narrow down or verify
+the location. This is especially important in poorly-sampled regions, e.g.,
+rural and wilderness areas.
+ In this paper, we introduce a novel metric, Recall vs Area (RvA), which
+measures the accuracy of estimated distributions of locations. RvA treats image
+geolocation results similarly to document retrieval, measuring recall as a
+function of area: for a ranked list of (possibly discontiguous) predicted
+regions, we measure the area required for accumulated regions to contain the
+ground truth coordinate. This produces a curve similar to a precision-recall
+curve, where "precision" is replaced by area in square kilometers, enabling
+evaluation for different downstream search area budgets.
+ Following from this view of the problem, we then examine an ensembling
+approach to global-scale image geolocation, which incorporates information from
+multiple sources and can readily incorporate multiple models, attribute
+predictors, and data sources. We study its effectiveness by combining the
+geolocation models GeoEstimation and the current state-of-the-art, GeoCLIP,
+with attribute predictors based on Oak Ridge National Laboratory LandScan and
+European Space Agency Climate Change Initiative Land Cover. We find significant
+improvements in image geolocation for areas that are under-represented in the
+training set, particularly non-urban areas, on both Im2GPS3k and Street View
+images.
+
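+ A sketch of the RvA computation for one image, with regions represented only
+by their areas and a precomputed containment flag (a simplification of the
+paper's geographic regions):
+
+import numpy as np
+
+def area_to_hit(region_areas_km2, contains_gt):
+    # Accumulate ranked regions until one contains the ground truth; the
+    # cumulative area at that point is the search budget needed for recall.
+    cum_area = np.cumsum(region_areas_km2)
+    hits = np.flatnonzero(contains_gt)
+    return cum_area[hits[0]] if hits.size else np.inf
+
+ Sweeping an area budget over many images and recording the fraction of hits
+within budget traces out the recall-vs-area curve described above.
+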
+
+
+
+
+ + ♻ ☆ GaussianObject: High-Quality 3D Object Reconstruction from Four Views + with Gaussian Splatting + + +
+ Reconstructing and rendering 3D objects from highly sparse views is of
+critical importance for promoting applications of 3D vision techniques and
+improving user experience. However, images from sparse views contain only very
+limited 3D information, leading to two significant challenges: 1) difficulty in
+building multi-view consistency as images for matching are too few; 2)
+partially omitted or highly compressed object information as view coverage is
+insufficient. To tackle these challenges, we propose GaussianObject, a
+framework to represent and render the 3D object with Gaussian splatting that
+achieves high rendering quality with only 4 input images. We first introduce
+techniques of visual hull and floater elimination, which explicitly inject
+structure priors into the initial optimization process to help build multi-view
+consistency, yielding a coarse 3D Gaussian representation. Then we construct a
+Gaussian repair model based on diffusion models to supplement the omitted
+object information, where Gaussians are further refined. We design a
+self-generating strategy to obtain image pairs for training the repair model.
+We further design a COLMAP-free variant, where pre-given accurate camera poses
+are not required, which achieves competitive quality and facilitates wider
+applications. GaussianObject is evaluated on several challenging datasets,
+including MipNeRF360, OmniObject3D, OpenIllumination, and our collected unposed
+images, achieving superior performance from only four views and significantly
+outperforming previous SOTA methods.
+
+
+ comment: Project page: https://gaussianobject.github.io/ +
+
+
+
+
+ + ♻ ☆ Accelerating Point Cloud Ground Segmentation: From Mechanical to + Solid-State Lidars + + +
+ In this study, we propose a novel parallel processing method for point cloud
+ground segmentation, aimed at the technology evolution from mechanical to
+solid-state Lidar (SSL). We first benchmark point-based, grid-based, and range
+image-based ground segmentation algorithms using the SemanticKITTI dataset. Our
+results indicate that the range image-based method offers superior performance
+and robustness, particularly in resilience to frame slicing. Implementing the
+proposed algorithm on an FPGA demonstrates significant improvements in
+processing speed and scalability of resource usage. Additionally, we develop a
+custom dataset using camera-SSL equipment on our test vehicle to validate the
+effectiveness of the parallel processing approach for SSL frames in the real
+world, achieving processing rates up to 30.9 times faster than CPU
+implementations. These findings underscore the potential of parallel processing
+strategies to enhance Lidar technologies for advanced perception tasks in
+autonomous vehicles and robotics. The data and code will be available
+post-publication on our GitHub repository:
+\url{https://github.com/WPI-APA-Lab/GroundSeg-Solid-State-Lidar-Parallel-Processing}
+
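+ For concreteness, a simplified version of the range image-based ground test
+that the benchmark favors: label a pixel as ground when the inclination angle
+between vertically adjacent range-image rows is small (the threshold and the
+horizontal-distance proxy here are illustrative assumptions):
+
+import numpy as np
+
+def ground_mask(range_img, z_img, max_slope_deg=10.0):
+    # range_img, z_img: (rows, cols) per-pixel range and height.
+    dz = np.abs(np.diff(z_img, axis=0))
+    dr = np.abs(np.diff(range_img, axis=0)) + 1e-6  # horizontal proxy
+    slope = np.degrees(np.arctan2(dz, dr))
+    mask = np.zeros(range_img.shape, dtype=bool)
+    mask[1:] = slope < max_slope_deg  # near-flat transitions -> ground
+    return mask
+
+ Because each column (and even each slice of columns) is independent, this
+formulation maps naturally onto the parallel FPGA processing described above.
+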
+
+ comment: 6 pages +
+
+
+
+
+ + ♻ ☆ SIG: A Synthetic Identity Generation Pipeline for Generating Evaluation + Datasets for Face Recognition + + +
+ As Artificial Intelligence applications expand, the evaluation of models
+faces heightened scrutiny. Ensuring public readiness requires evaluation
+datasets, which differ from training data by being disjoint and ethically
+sourced in compliance with privacy regulations. The performance and fairness of
+face recognition systems depend significantly on the quality and
+representativeness of these evaluation datasets. This data is sometimes scraped
+from the internet without users' consent, causing ethical concerns that can
+prohibit its use without proper releases. In rare cases, data is collected in a
+controlled environment with consent; however, this process is time-consuming,
+expensive, and logistically difficult to execute. This creates a barrier for
+those unable to marshal the immense resources required to gather ethically
+sourced evaluation datasets. To address these challenges, we introduce the
+Synthetic Identity Generation pipeline, or SIG, that allows for the targeted
+creation of ethical, balanced datasets for face recognition evaluation. Our
+proposed and demonstrated pipeline generates high-quality images of synthetic
+identities with controllable pose, facial features, and demographic attributes,
+such as race, gender, and age. We also release an open-source evaluation
+dataset named ControlFace10k, consisting of 10,008 face images of 3,336 unique
+synthetic identities balanced across race, gender, and age, generated using the
+proposed SIG pipeline. We analyze ControlFace10k along with a non-synthetic
+BUPT dataset using state-of-the-art face recognition algorithms to demonstrate
+its effectiveness as an evaluation tool. This analysis highlights the dataset's
+characteristics and its utility in assessing algorithmic bias across different
+demographic groups.
+
+
+
+
+
+ + ♻ ☆ S$^3$Attention: Improving Long Sequence Attention with Smoothed Skeleton + Sketching + + +
+ Attention based models have achieved many remarkable breakthroughs in
+numerous applications. However, the quadratic complexity of Attention makes the
+vanilla Attention based models hard to apply to long sequence tasks. Various
+improved Attention structures have been proposed to reduce the computation cost
+by inducing low rankness and approximating the whole sequence by sub-sequences.
+The most challenging part of those approaches is maintaining the proper balance
+between information preservation and computation reduction: the longer the
+sub-sequences used, the better the information preserved, but at the price of
+introducing more noise and computational costs. In this paper, we propose a
+smoothed skeleton sketching based Attention structure, coined S$^3$Attention,
+which significantly improves upon the previous attempts to negotiate this
+trade-off. S$^3$Attention has two mechanisms to effectively minimize the impact
+of noise while keeping linear complexity in the sequence length: a
+smoothing block to mix information over long sequences and a matrix sketching
+method that simultaneously selects columns and rows from the input matrix. We
+verify the effectiveness of S$^3$Attention both theoretically and empirically.
+Extensive studies over Long Range Arena (LRA) datasets and six time-series
+forecasting tasks show that S$^3$Attention significantly outperforms both
+vanilla Attention and other state-of-the-art variants of Attention structures.
+
+
+
+
+
+
+
+
+ + Information Retrieval 14 + +
+
+
+ + ☆ Towards Fair RAG: On the Impact of Fair Ranking in Retrieval-Augmented + Generation + + +
+ Many language models now enhance their responses with retrieval capabilities, +leading to the widespread adoption of retrieval-augmented generation (RAG) +systems. However, despite retrieval being a core component of RAG, much of the +research in this area overlooks the extensive body of work on fair ranking, +neglecting the importance of considering all stakeholders involved. This paper +presents the first systematic evaluation of RAG systems integrated with fair +rankings. We focus specifically on measuring the fair exposure of each relevant +item across the rankings utilized by RAG systems (i.e., item-side fairness), +aiming to promote equitable growth for relevant item providers. To gain a deep +understanding of the relationship between item-fairness, ranking quality, and +generation quality in the context of RAG, we analyze nine different RAG systems +that incorporate fair rankings across seven distinct datasets. Our findings +indicate that RAG systems with fair rankings can maintain a high level of +generation quality and, in many cases, even outperform traditional RAG systems, +despite the general trend of a tradeoff between ensuring fairness and +maintaining system-effectiveness. We believe our insights lay the groundwork +for responsible and equitable RAG systems and open new avenues for future +research. We publicly release our codebase and dataset at +https://github.com/kimdanny/Fair-RAG. + +
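+ To make "fair exposure" concrete, a toy computation of item-side exposure
+under a standard position-discounted browsing model (the discount choice is a
+common IR assumption, not necessarily this paper's exact protocol):
+
+import numpy as np
+
+def exposure_share(rankings, n_items):
+    # rankings: list of rankings, each a list of item ids (top first).
+    exp = np.zeros(n_items)
+    for ranking in rankings:
+        for pos, item in enumerate(ranking):
+            exp[item] += 1.0 / np.log2(pos + 2)  # DCG-style discount
+    return exp / exp.sum()  # normalized exposure per item provider
+
+ Comparing this share against each relevant item's target share quantifies how
+evenly a RAG system's rankings spread exposure across providers.
+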
+
+
+
+
+ + ☆ A Framework for Ranking Content Providers Using Prompt Engineering and + Self-Attention Network + + +
+ This paper addresses the problem of ranking Content Providers for a Content
+Recommendation System. Content Providers are the sources of news and other
+types of content, such as lifestyle, travel, and gardening. We propose a
+framework that leverages explicit user feedback, such as clicks and reactions,
+and content-based features, such as writing style and frequency of publishing,
+to rank Content Providers for a given topic. We also use language models to
+engineer prompts that help us create a ground truth dataset for this previously
+unsupervised ranking problem. Using this ground truth, we then train a
+self-attention based network on a listwise Learning-to-Rank task. We
+evaluate our framework using online experiments and show that it can improve
+the quality, credibility, and diversity of the content recommended to users.
+
+
+
+
+
+ + ☆ Perceptions of Edinburgh: Capturing Neighbourhood Characteristics by + Clustering Geoparsed Local News + + +
+ The communities that we live in affect our health in ways that are complex +and hard to define. Moreover, our understanding of the place-based processes +affecting health and inequalities is limited. This undermines the development +of robust policy interventions to improve local health and well-being. News +media provides social and community information that may be useful in health +studies. Here we propose a methodology for characterising neighbourhoods by +using local news articles. More specifically, we show how we can use Natural +Language Processing (NLP) to unlock further information about neighbourhoods by +analysing, geoparsing and clustering news articles. Our work is novel because +we combine street-level geoparsing tailored to the locality with clustering of +full news articles, enabling a more detailed examination of neighbourhood +characteristics. We evaluate our outputs and show via a confluence of evidence, +both from a qualitative and a quantitative perspective, that the themes we +extract from news articles are sensible and reflect many characteristics of the +real world. This is significant because it allows us to better understand the +effects of neighbourhoods on health. Our findings on neighbourhood +characterisation using news data will support a new generation of place-based +research which examines a wider set of spatial processes and how they affect +health, enabling new epidemiological research. + +
+
+ comment: Preprint - paper under submission +
+
+
+
+
+ + ☆ Beyond Relevance: Improving User Engagement by Personalization for + Short-Video Search + + +
+ Personalized search has been extensively studied in various applications,
+including web search, e-commerce, social networks, etc. With the soaring
+popularity of short-video platforms, exemplified by TikTok and Kuaishou, the
+question arises: can personalization elevate the realm of short-video search,
+and if so, which techniques hold the key?
+ In this work, we introduce $\text{PR}^2$, a novel and comprehensive solution
+for personalizing short-video search, where $\text{PR}^2$ stands for the
+Personalized Retrieval and Ranking augmented search system. Specifically,
+$\text{PR}^2$ leverages query-relevant collaborative filtering and personalized
+dense retrieval to extract relevant and individually tailored content from a
+large-scale video corpus. Furthermore, it utilizes the QIN (Query-Dominant User
+Interest Network) ranking model to effectively harness users' long-term
+preferences and real-time behaviors, and to efficiently learn from users'
+various forms of implicit feedback through a multi-task learning framework. By
+deploying $\text{PR}^2$ in our production system, we have achieved the most
+remarkable user engagement improvements in recent years: a 10.2% increase in
+CTR@10, a notable 20% surge in video watch time, and a 1.6% uplift of search
+DAU. We believe the practical insights presented in this work are valuable
+especially for building and improving personalized search systems for
+short-video platforms.
+
+
+
+
+
+ + ☆ P-RAG: Progressive Retrieval Augmented Generation For Planning on + Embodied Everyday Task + + +
+ Embodied Everyday Task is a popular task in the embodied AI community,
+requiring agents to make a sequence of actions based on natural language
+instructions and visual observations. Traditional learning-based approaches
+face two challenges. Firstly, natural language instructions often lack explicit
+task planning. Secondly, extensive training is required to equip models with
+knowledge of the task environment. Previous works based on Large Language
+Models (LLMs) either suffer from poor performance due to the lack of
+task-specific knowledge or rely on ground truth as few-shot samples. To address
+the above limitations, we propose a novel approach called Progressive Retrieval
+Augmented Generation (P-RAG), which not only effectively leverages the powerful
+language processing capabilities of LLMs but also progressively accumulates
+task-specific knowledge without ground truth. Compared to conventional RAG
+methods, which retrieve relevant information from the database in a one-shot
+manner to assist generation, P-RAG introduces an iterative approach that
+progressively updates the database. In each iteration, P-RAG retrieves the
+latest database and obtains historical information from the previous
+interaction as experiential references for the current interaction. Moreover,
+we also introduce a more granular retrieval scheme that not only retrieves
+similar tasks but also incorporates retrieval of similar situations to provide
+more valuable reference experiences. Extensive experiments reveal that P-RAG
+achieves competitive results without utilizing ground truth and can even
+further improve performance through self-iterations.
+
+
+
+
+
+ + ☆ Evaluation of pretrained language models on music understanding + + +
+ Music-text multimodal systems have enabled new approaches to Music +Information Research (MIR) applications such as audio-to-text and text-to-audio +retrieval, text-based song generation, and music captioning. Despite the +reported success, little effort has been put into evaluating the musical +knowledge of Large Language Models (LLM). In this paper, we demonstrate that +LLMs suffer from 1) prompt sensitivity, 2) inability to model negation (e.g. +'rock song without guitar'), and 3) sensitivity towards the presence of +specific words. We quantified these properties as a triplet-based accuracy, +evaluating the ability to model the relative similarity of labels in a +hierarchical ontology. We leveraged the Audioset ontology to generate triplets +consisting of an anchor, a positive (relevant) label, and a negative (less +relevant) label for the genre and instruments sub-tree. We evaluated the +triplet-based musical knowledge for six general-purpose Transformer-based +models. The triplets obtained through this methodology required filtering, as +some were difficult to judge and therefore relatively uninformative for +evaluation purposes. Despite the relatively high accuracy reported, +inconsistencies are evident in all six models, suggesting that off-the-shelf +LLMs need adaptation to music before use. + +
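+ A minimal sketch of the triplet-based accuracy, assuming an embed callable
+that maps label text to a vector (the model and the cosine-similarity choice
+are illustrative):
+
+import numpy as np
+
+def triplet_accuracy(triplets, embed):
+    def cos(a, b):
+        return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))
+    correct = 0
+    for anchor, positive, negative in triplets:
+        a, p, n = embed(anchor), embed(positive), embed(negative)
+        correct += cos(a, p) > cos(a, n)  # anchor closer to relevant label?
+    return correct / len(triplets)
+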
+
+
+
+
+ + ☆ Promptriever: Instruction-Trained Retrievers Can Be Prompted Like + Language Models + + +
+ Instruction-tuned language models (LM) are able to respond to imperative +commands, providing a more natural user interface compared to their base +counterparts. In this work, we present Promptriever, the first retrieval model +able to be prompted like an LM. To train Promptriever, we curate and release a +new instance-level instruction training set from MS MARCO, spanning nearly 500k +instances. Promptriever not only achieves strong performance on standard +retrieval tasks, but also follows instructions. We observe: (1) large gains +(reaching SoTA) on following detailed relevance instructions (+14.3 p-MRR / ++3.1 nDCG on FollowIR), (2) significantly increased robustness to lexical +choices/phrasing in the query+instruction (+12.9 Robustness@10 on InstructIR), +and (3) the ability to perform hyperparameter search via prompting to reliably +improve retrieval performance (+1.4 average increase on BEIR). Promptriever +demonstrates that retrieval models can be controlled with prompts on a +per-query basis, setting the stage for future work aligning LM prompting +techniques with information retrieval. + +
+
+
+
+
+ + ☆ Multi-modal Generative Models in Recommendation System + + +
+ Many recommendation systems limit user inputs to text strings or behavior +signals such as clicks and purchases, and system outputs to a list of products +sorted by relevance. With the advent of generative AI, users have come to +expect richer levels of interactions. In visual search, for example, a user may +provide a picture of their desired product along with a natural language +modification of the content of the picture (e.g., a dress like the one shown in +the picture but in red color). Moreover, users may want to better understand +the recommendations they receive by visualizing how the product fits their use +case, e.g., with a representation of how a garment might look on them, or how a +furniture item might look in their room. Such advanced levels of interaction +require recommendation systems that are able to discover both shared and +complementary information about the product across modalities, and visualize +the product in a realistic and informative way. However, existing systems often +treat multiple modalities independently: text search is usually done by +comparing the user query to product titles and descriptions, while visual +search is typically done by comparing an image provided by the customer to +product images. We argue that future recommendation systems will benefit from a +multi-modal understanding of the products that leverages the rich information +retailers have about both customers and products to come up with the best +recommendations. In this chapter we review recommendation systems that use +multiple data modalities simultaneously. + +
+
+ comment: 32 pages 5 figures +
+
+
+
+
+ + ☆ Inside Alameda Research: A Multi-Token Network Analysis + + +
+ We analyze the token transfer network on Ethereum, focusing on accounts +associated with Alameda Research, a cryptocurrency trading firm implicated in +the misuse of FTX customer funds. Using a multi-token network representation, +we examine node centralities and the network backbone to identify critical +accounts, tokens, and activity groups. The temporal evolution of Alameda +accounts reveals shifts in token accumulation and distribution patterns leading +up to its bankruptcy in November 2022. Through network analysis, our work +offers insights into the activities and dynamics that shape the DeFi ecosystem. + +
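+ A toy illustration of the multi-token network representation and a centrality
+query, using placeholder accounts and volumes rather than the actual on-chain
+data:
+
+import networkx as nx
+
+G = nx.MultiDiGraph()  # one edge per (sender, receiver, token)
+G.add_edge("alameda_1", "exchange_a", token="USDT", weight=1_000_000)
+G.add_edge("alameda_1", "alameda_2", token="FTT", weight=250_000)
+
+# Collapse parallel token edges into a weighted digraph for centrality.
+H = nx.DiGraph()
+for u, v, d in G.edges(data=True):
+    w = H.get_edge_data(u, v, default={"weight": 0})["weight"] + d["weight"]
+    H.add_edge(u, v, weight=w)
+print(nx.degree_centrality(H))
+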
+
+
+
+
+ + ☆ GenCRF: Generative Clustering and Reformulation Framework for Enhanced + Intent-Driven Information Retrieval + + +
+ Query reformulation is a well-known problem in Information Retrieval (IR)
+aimed at improving the success rate of a single search by automatically
+modifying the user's input query. Recent methods leverage Large Language Models
+(LLMs) to improve query reformulation, but often generate limited and redundant
+expansions, potentially constraining their effectiveness in capturing diverse
+intents. In this paper, we propose GenCRF: a Generative Clustering and
+Reformulation Framework to capture diverse intentions adaptively based on
+multiple differentiated, well-generated queries in the retrieval phase for the
+first time. GenCRF leverages LLMs to generate variable queries from the initial
+query using customized prompts, then clusters them into groups to distinctly
+represent diverse intents. Furthermore, the framework explores combining the
+diverse intent queries through innovative weighted aggregation strategies to
+optimize retrieval performance, and crucially integrates a novel Query
+Evaluation Rewarding Model (QERM) to refine the process through feedback loops.
+Empirical experiments on the BEIR benchmark demonstrate that GenCRF achieves
+state-of-the-art performance, surpassing previous query reformulation SOTAs by
+up to 12% on nDCG@10. These techniques can be adapted to various LLMs,
+significantly boosting retriever performance and advancing the field of
+Information Retrieval.
+
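+ A sketch of the generate-then-cluster step, with llm_rewrite and embed as
+assumed callables and the cluster count as a placeholder (the weighted
+aggregation and QERM feedback loop are omitted):
+
+import numpy as np
+from sklearn.cluster import KMeans
+
+def cluster_reformulations(query, llm_rewrite, embed, n_intents=3, n_gen=12):
+    variants = [llm_rewrite(query) for _ in range(n_gen)]
+    X = np.stack([embed(v) for v in variants])
+    labels = KMeans(n_clusters=n_intents, n_init=10).fit_predict(X)
+    # Each cluster of reformulations stands in for one distinct intent.
+    return {k: [v for v, l in zip(variants, labels) if l == k]
+            for k in range(n_intents)}
+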
+
+
+
+
+ + ☆ Attention-Seeker: Dynamic Self-Attention Scoring for Unsupervised + Keyphrase Extraction + + +
+ This paper proposes Attention-Seeker, an unsupervised keyphrase extraction +method that leverages self-attention maps from a Large Language Model to +estimate the importance of candidate phrases. Our approach identifies specific +components - such as layers, heads, and attention vectors - where the model +pays significant attention to the key topics of the text. The attention weights +provided by these components are then used to score the candidate phrases. +Unlike previous models that require manual tuning of parameters (e.g., +selection of heads, prompts, hyperparameters), Attention-Seeker dynamically +adapts to the input text without any manual adjustments, enhancing its +practical applicability. We evaluate Attention-Seeker on four publicly +available datasets: Inspec, SemEval2010, SemEval2017, and Krapivin. Our results +demonstrate that, even without parameter tuning, Attention-Seeker outperforms +most baseline models, achieving state-of-the-art performance on three out of +four datasets, particularly excelling in extracting keyphrases from long +documents. + +
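+ A stripped-down sketch of attention-based phrase scoring (a small encoder is
+used for brevity, and the plain mean over layers and heads replaces the paper's
+dynamic component selection):
+
+import torch
+from transformers import AutoModel, AutoTokenizer
+
+tok = AutoTokenizer.from_pretrained("bert-base-uncased")
+model = AutoModel.from_pretrained("bert-base-uncased", output_attentions=True)
+
+def phrase_score(text, char_span):
+    enc = tok(text, return_tensors="pt", return_offsets_mapping=True)
+    offsets = enc.pop("offset_mapping")[0]
+    with torch.no_grad():
+        att = torch.stack(model(**enc).attentions)  # (layers, 1, heads, L, L)
+    inbound = att.mean(dim=(0, 1, 2)).sum(dim=0)    # attention received per token
+    mask = (offsets[:, 0] >= char_span[0]) & (offsets[:, 1] <= char_span[1])
+    return inbound[mask].mean().item()  # mean attention over the phrase's tokens
+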
+
+
+
+
+ + ☆ Challenging Fairness: A Comprehensive Exploration of Bias in LLM-Based + Recommendations + + +
+ Large Language Model (LLM)-based recommendation systems provide more
+comprehensive recommendations than traditional systems by deeply analyzing
+content and user behavior. However, these systems often exhibit biases,
+favoring mainstream content while marginalizing non-traditional options due to
+skewed training data. This study investigates the intricate relationship
+between bias and LLM-based recommendation systems, with a focus on music, song,
+and book recommendations across diverse demographic and cultural groups.
+Through a comprehensive analysis conducted over different LLM models, this
+paper evaluates the impact of bias on recommendation outcomes. Our findings
+reveal that bias is deeply ingrained within these systems, yet even a simple
+intervention like prompt engineering can significantly reduce it, underscoring
+the pervasive nature of the issue. Moreover, factors like intersecting
+identities and contextual information, such as socioeconomic status, further
+amplify these biases, demonstrating the complexity and depth of the challenges
+faced in creating fair recommendations across different groups.
+
+
+
+
+
+ + ♻ ☆ Modeling Sustainable City Trips: Integrating CO2e Emissions, Popularity, + and Seasonality into Tourism Recommender Systems + + +
+ Tourism affects not only the tourism industry but also society and +stakeholders such as the environment, local businesses, and residents. Tourism +Recommender Systems (TRS) can be pivotal in promoting sustainable tourism by +guiding travelers toward destinations with minimal negative impact. Our paper +introduces a composite sustainability indicator for a city trip TRS based on +the users' starting point and month of travel. This indicator integrates CO2e +emissions for different transportation modes and analyses destination +popularity and seasonal demand. We quantify city popularity based on user +reviews, points of interest, and search trends from Tripadvisor and Google +Trends data. To calculate a seasonal demand index, we leverage data from +TourMIS and Airbnb. We conducted a user study to explore the fundamental +trade-offs in travel decision-making and determine the weights for our proposed +indicator. Finally, we demonstrate the integration of this indicator into a +TRS, illustrating its ability to deliver sustainable city trip recommendations. +This work lays the foundation for future research by integrating sustainability +measures and contributing to responsible recommendations by TRS. + +
+
+
+
+
+ + ♻ ☆ Music auto-tagging in the long tail: A few-shot approach + + +
+ In the realm of digital music, using tags to efficiently organize and +retrieve music from extensive databases is crucial for music catalog owners. +Human tagging by experts is labor-intensive but mostly accurate, whereas +automatic tagging through supervised learning has approached satisfying +accuracy but is restricted to a predefined set of training tags. Few-shot +learning offers a viable solution to expand beyond this small set of predefined +tags by enabling models to learn from only a few human-provided examples to +understand tag meanings and subsequently apply these tags autonomously. We +propose to integrate few-shot learning methodology into multi-label music +auto-tagging by using features from pre-trained models as inputs to a +lightweight linear classifier, also known as a linear probe. We investigate +different popular pre-trained features, as well as different few-shot +parametrizations with varying numbers of classes and samples per class. Our +experiments demonstrate that a simple model with pre-trained features can +achieve performance close to state-of-the-art models while using significantly +less training data, such as 20 samples per tag. Additionally, our linear probe +performs competitively with leading models when trained on the entire training +dataset. The results show that this transfer learning-based few-shot approach +could effectively address the issue of automatically assigning long-tail tags +with only limited labeled data. + +
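+ The linear-probe recipe above is short enough to sketch directly; here
+extract_features is an assumed callable wrapping a frozen pre-trained audio
+model, and 20 examples per tag mirrors the setting reported above:
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+def fit_probes(clips, tag_labels, extract_features):
+    # clips: audio examples; tag_labels: {tag: binary label per clip}.
+    X = np.stack([extract_features(c) for c in clips])  # frozen embeddings
+    # One one-vs-rest linear probe per tag supports multi-label tagging.
+    return {tag: LogisticRegression(max_iter=1000).fit(X, y)
+            for tag, y in tag_labels.items()}
+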
+
+ comment: Published in Audio Engineering Society NY Show 2024 as a Peer + Reviewed (Category 1) paper; typos corrected +
+
+
+
+
+
+
+
+ + Machine Learning 20 + +
+
+
+ + ☆ Time-Series Forecasting, Knowledge Distillation, and Refinement within a + Multimodal PDE Foundation Model + + +
+ Symbolic encoding has been used in multi-operator learning as a way to embed
+additional information for distinct time-series data. For spatiotemporal
+systems described by time-dependent partial differential equations, the
+equation itself provides an additional modality to identify the system. The
+utilization of symbolic expressions alongside time-series samples allows for
+the development of multimodal predictive neural networks. A key challenge with
+current approaches is that the symbolic information, i.e. the equations, must
+be manually preprocessed (simplified, rearranged, etc.) to match and relate to
+the existing token library, which increases costs and reduces flexibility,
+especially when dealing with new differential equations. We propose a new token
+library based on SymPy to encode differential equations as an additional
+modality for time-series models. The proposed approach incurs minimal cost, is
+automated, and maintains high prediction accuracy for forecasting tasks.
+Additionally, we include a Bayesian filtering module that connects the
+different modalities to refine the learned equation. This improves the accuracy
+of the learned symbolic representation and the predicted time-series.
+
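+ A minimal sketch of what a SymPy-based tokenization can look like: a
+pre-order walk over the expression tree of a PDE, with node class names as
+tokens (the token vocabulary here is an illustrative assumption, not the
+paper's library):
+
+import sympy as sp
+
+t, x = sp.symbols("t x")
+u = sp.Function("u")(t, x)
+heat = sp.Eq(sp.Derivative(u, t), sp.Derivative(u, x, 2))  # u_t = u_xx
+
+def tokenize(expr):
+    # Leaves (symbols, numbers) become literal tokens; interior nodes emit
+    # their operator name followed by their arguments' tokens.
+    if not expr.args:
+        return [str(expr)]
+    tokens = [type(expr).__name__]
+    for arg in expr.args:
+        tokens += tokenize(arg)
+    return tokens
+
+print(tokenize(heat))
+
+ Because SymPy parses and canonicalizes the equation, no manual rearrangement
+is needed before it can be emitted as a token sequence.
+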
+
+
+
+
+ + ☆ DiffESM: Conditional Emulation of Temperature and Precipitation in Earth + System Models with 3D Diffusion Models + + +
+ Earth System Models (ESMs) are essential for understanding the interaction +between human activities and the Earth's climate. However, the computational +demands of ESMs often limit the number of simulations that can be run, +hindering the robust analysis of risks associated with extreme weather events. +While low-cost climate emulators have emerged as an alternative to emulate ESMs +and enable rapid analysis of future climate, many of these emulators only +provide output on at most a monthly frequency. This temporal resolution is +insufficient for analyzing events that require daily characterization, such as +heat waves or heavy precipitation. We propose using diffusion models, a class +of generative deep learning models, to effectively downscale ESM output from a +monthly to a daily frequency. Trained on a handful of ESM realizations, +reflecting a wide range of radiative forcings, our DiffESM model takes monthly +mean precipitation or temperature as input, and is capable of producing daily +values with statistical characteristics close to ESM output. Combined with a +low-cost emulator providing monthly means, this approach requires only a small +fraction of the computational resources needed to run a large ensemble. We +evaluate model behavior using a number of extreme metrics, showing that DiffESM +closely matches the spatio-temporal behavior of the ESM output it emulates in +terms of the frequency and spatial characteristics of phenomena such as heat +waves, dry spells, or rainfall intensity. + +
+
+ comment: Accepted for publication in Journal of Advances in Modeling Earth + Systems +
+
+
+
+
+
+ ☆ No Saved Kaleidoscope: a 100% Jitted Neural Network Coding Language with
+ Pythonic Syntax
+
+
+ We developed a jitted compiler for training Artificial Neural Networks using
+C++, LLVM and CUDA. It features object-oriented characteristics, strong typing,
+parallel workers for data pre-processing, pythonic syntax for expressions,
+PyTorch-like model declaration and automatic differentiation. We implement
+caching and pooling mechanisms to manage VRAM, and use cuBLAS for high
+performance matrix multiplication and cuDNN for convolutional layers. In our
+experiments with Residual Convolutional Neural Networks on ImageNet, we reach
+similar speed but degraded performance. The GRU network experiments show
+similar accuracy, but our compiler has degraded speed on that task. However,
+our compiler demonstrates promising results on the CIFAR-10 benchmark, where
+we reach the same performance and about the same speed as PyTorch. We make the
+code publicly available at: https://github.com/NoSavedDATA/NoSavedKaleidoscope
+
+
+ comment: 12 pages, 3 figures and 3 tables +
+
+
+
+
+ + ☆ The Sample Complexity of Smooth Boosting and the Tightness of the + Hardcore Theorem + + +
+ Smooth boosters generate distributions that do not place too much weight on
+any given example. Originally introduced for their noise-tolerant properties,
+such boosters have also found applications in differential privacy,
+reproducibility, and quantum learning theory. We study and settle the sample
+complexity of smooth boosting: we exhibit a class that can be weakly learned to
+$\gamma$-advantage over smooth distributions with $m$ samples, for which strong
+learning over the uniform distribution requires
+$\tilde{\Omega}(1/\gamma^2)\cdot m$ samples. This matches the overhead of
+existing smooth boosters and provides the first separation from the setting of
+distribution-independent boosting, for which the corresponding overhead is
+$O(1/\gamma)$.
+ Our work also sheds new light on Impagliazzo's hardcore theorem from
+complexity theory, all known proofs of which can be cast in the framework of
+smooth boosting. For a function $f$ that is mildly hard against size-$s$
+circuits, the hardcore theorem provides a set of inputs on which $f$ is
+extremely hard against size-$s'$ circuits. A downside of this important result
+is the loss in circuit size, i.e. that $s' \ll s$. Answering a question of
+Trevisan, we show that this size loss is necessary and, in fact, that the
+parameters achieved by known proofs are the best possible.
+
+
+ comment: 46 pages, FOCS 2024 +
+
+
+
+
+ + ☆ Outlier Detection with Cluster Catch Digraphs + + +
+ This paper introduces a novel family of outlier detection algorithms based on
+Cluster Catch Digraphs (CCDs), specifically tailored to address the challenges
+of high dimensionality and varying cluster shapes, which deteriorate the
+performance of most traditional outlier detection methods. We propose the
+Uniformity-Based CCD with Mutual Catch Graph (U-MCCD), the Uniformity- and
+Neighbor-Based CCD with Mutual Catch Graph (UN-MCCD), and their shape-adaptive
+variants (SU-MCCD and SUN-MCCD), which are designed to detect outliers in data
+sets with arbitrary cluster shapes and high dimensions. We present the
+advantages and shortcomings of these algorithms and motivate the need for each
+particular variant. Through comprehensive Monte Carlo simulations, we assess
+their performance and demonstrate the robustness and effectiveness of our
+algorithms across various settings and contamination levels. We also
+illustrate the use of our algorithms on various real-life data sets. The
+U-MCCD algorithm efficiently identifies outliers while maintaining high true
+negative rates, and the SU-MCCD algorithm shows substantial improvement in
+handling non-uniform clusters. Additionally, the UN-MCCD and SUN-MCCD
+algorithms address the limitations of existing methods in high-dimensional
+spaces by utilizing Nearest Neighbor Distances (NND) for clustering and
+outlier detection. Our results indicate that these novel algorithms offer
+substantial advancements in the accuracy and adaptability of outlier
+detection, providing a valuable tool for various real-world applications.
+ Keywords: Outlier detection, Graph-based clustering, Cluster catch digraphs,
+$k$-nearest-neighborhood, Mutual catch graphs, Nearest neighbor distance.
+
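+ As a sketch of just the NND ingredient (not the full CCD construction),
+flagging points whose k-nearest-neighbor distance sits in the upper tail:
+
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+def nnd_outlier_mask(X, k=5, quantile=0.95):
+    nn = NearestNeighbors(n_neighbors=k + 1).fit(X)  # +1 skips self-distance
+    dists, _ = nn.kneighbors(X)
+    knn_dist = dists[:, -1]  # distance to the k-th true neighbor
+    return knn_dist > np.quantile(knn_dist, quantile)  # boolean outlier mask
+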
+
+ comment: 73 pages, 146 figures +
+
+
+
+
+ + ☆ Self-Contrastive Forward-Forward Algorithm + + +
+ The Forward-Forward (FF) algorithm is a recent, purely forward-mode learning +method, that updates weights locally and layer-wise and supports supervised as +well as unsupervised learning. These features make it ideal for applications +such as brain-inspired learning, low-power hardware neural networks, and +distributed learning in large models. However, while FF has shown promise on +written digit recognition tasks, its performance on natural images and +time-series remains a challenge. A key limitation is the need to generate +high-quality negative examples for contrastive learning, especially in +unsupervised tasks, where versatile solutions are currently lacking. To address +this, we introduce the Self-Contrastive Forward-Forward (SCFF) method, inspired +by self-supervised contrastive learning. SCFF generates positive and negative +examples applicable across different datasets, surpassing existing local +forward algorithms for unsupervised classification accuracy on MNIST (MLP: +98.7%), CIFAR-10 (CNN: 80.75%), and STL-10 (CNN: 77.3%). Additionally, SCFF is +the first to enable FF training of recurrent neural networks, opening the door +to more complex tasks and continuous-time video and text processing. + +
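+ One way to picture the self-contrastive example construction, with the
+channel-concatenation scheme below being an assumption inspired by the abstract
+rather than the authors' exact recipe:
+
+import torch
+
+def make_ff_pairs(batch):
+    # batch: (B, C, H, W). Positive: a sample paired with itself; negative:
+    # the same sample paired with a different, randomly drawn one. Each FF
+    # layer is then trained for high "goodness" on positives, low on negatives.
+    pos = torch.cat([batch, batch], dim=1)
+    neg = torch.cat([batch, batch[torch.randperm(batch.size(0))]], dim=1)
+    return pos, neg
+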
+
+
+
+
+ + ☆ Advances in APPFL: A Comprehensive and Extensible Federated Learning + Framework + + +
+ Federated learning (FL) is a distributed machine learning paradigm enabling
+collaborative model training while preserving data privacy. In today's
+landscape, where most data is proprietary, confidential, and distributed, FL
+has become a promising approach to leverage such data effectively, particularly
+in sensitive domains such as medicine and the electric grid. Heterogeneity and
+security are the key challenges in FL; however, most existing FL frameworks
+either fail to address these challenges adequately or lack the flexibility to
+incorporate new solutions. To this end, we present the recent advances in
+developing APPFL, an extensible framework and benchmarking suite for federated
+learning, which offers comprehensive solutions for heterogeneity and security
+concerns, as well as user-friendly interfaces for integrating new algorithms or
+adapting to new applications. We demonstrate the capabilities of APPFL through
+extensive experiments evaluating various aspects of FL, including communication
+efficiency, privacy preservation, computational performance, and resource
+utilization. We further highlight the extensibility of APPFL through case
+studies in vertical, hierarchical, and decentralized FL. APPFL is open-sourced
+at https://github.com/APPFL/APPFL.
+
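+ As orientation for readers new to FL, the baseline aggregation step that most
+frameworks generalize is federated averaging. The sketch below is a generic
+FedAvg illustration in JavaScript, not APPFL's actual API (APPFL itself is a
+Python framework):
+
+    // Average client model updates, weighting each client by its sample count.
+    function fedAvg(clientUpdates) {
+      // clientUpdates: [{ weights: number[], numSamples: number }, ...]
+      const total = clientUpdates.reduce((s, c) => s + c.numSamples, 0);
+      const dim = clientUpdates[0].weights.length;
+      const globalWeights = new Array(dim).fill(0);
+      for (const c of clientUpdates) {
+        const w = c.numSamples / total; // data-size weighting
+        for (let i = 0; i < dim; i++) globalWeights[i] += w * c.weights[i];
+      }
+      return globalWeights;
+    }
+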
+
+
+
+
+ + ☆ Automating proton PBS treatment planning for head and neck cancers using + policy gradient-based deep reinforcement learning + + +
+ Proton pencil beam scanning (PBS) treatment planning for head and neck (H&N) +cancers is a time-consuming and experience-demanding task where a large number +of planning objectives are involved. Deep reinforcement learning (DRL) has +recently been introduced to the planning processes of intensity-modulated +radiation therapy and brachytherapy for prostate, lung, and cervical cancers. +However, existing approaches are built upon the Q-learning framework and +weighted linear combinations of clinical metrics, suffering from poor +scalability and flexibility and only capable of adjusting a limited number of +planning objectives in discrete action spaces. We propose an automatic +treatment planning model using the proximal policy optimization (PPO) algorithm +and a dose distribution-based reward function for proton PBS treatment planning +of H&N cancers. Specifically, a set of empirical rules is used to create +auxiliary planning structures from target volumes and organs-at-risk (OARs), +along with their associated planning objectives. These planning objectives are +fed into an in-house optimization engine to generate the spot monitor unit (MU) +values. A decision-making policy network trained using PPO is developed to +iteratively adjust the involved planning objective parameters in a continuous +action space and refine the PBS treatment plans using a novel dose +distribution-based reward function. Proton H&N treatment plans generated by the +model show improved OAR sparing with equal or superior target coverage when +compared with human-generated plans. Moreover, additional experiments on liver +cancer demonstrate that the proposed method can be successfully generalized to +other treatment sites. To the best of our knowledge, this is the first +DRL-based automatic treatment planning model capable of achieving human-level +performance for H&N cancers. + +
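+ For reference, the clipped surrogate objective that PPO optimizes (the
+standard form from Schulman et al., 2017; the paper's contribution is the dose
+distribution-based reward feeding into it, which is not reproduced here) is
+$L^{\mathrm{CLIP}}(\theta) = \mathbb{E}_t\left[\min\left(r_t(\theta)\hat{A}_t,\
+\mathrm{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon)\hat{A}_t\right)\right]$,
+where $r_t(\theta) = \pi_\theta(a_t \mid s_t)/\pi_{\theta_{\mathrm{old}}}(a_t
+\mid s_t)$ is the policy probability ratio and $\hat{A}_t$ is the advantage
+estimate.
+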
+
+
+
+
+ + ☆ Preference Tuning with Human Feedback on Language, Speech, and Vision + Tasks: A Survey + + +
+ Preference tuning is a crucial process for aligning deep generative models +with human preferences. This survey offers a thorough overview of recent +advancements in preference tuning and the integration of human feedback. The +paper is organized into three main sections: 1) introduction and preliminaries: +an introduction to reinforcement learning frameworks, preference tuning tasks, +models, and datasets across various modalities: language, speech, and vision, +as well as different policy approaches, 2) in-depth examination of each +preference tuning approach: a detailed analysis of the methods used in +preference tuning, and 3) applications, discussion, and future directions: an +exploration of the applications of preference tuning in downstream tasks, +including evaluation methods for different modalities, and an outlook on future +research directions. Our objective is to present the latest methodologies in +preference tuning and model alignment, enhancing the understanding of this +field for researchers and practitioners. We hope to encourage further +engagement and innovation in this area. + +
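+ As shared background for the RL-based methods this survey organizes, the
+canonical KL-regularized preference-tuning objective (a generic form, not
+specific to this paper) is
+$\max_{\pi_\theta}\ \mathbb{E}_{x \sim \mathcal{D},\, y \sim \pi_\theta(\cdot
+\mid x)}\left[r_\phi(x, y)\right] - \beta\, \mathrm{KL}\!\left(\pi_\theta(\cdot
+\mid x) \,\|\, \pi_{\mathrm{ref}}(\cdot \mid x)\right)$,
+where $r_\phi$ is a learned reward (preference) model, $\pi_{\mathrm{ref}}$ is
+the reference policy, and $\beta$ controls how far tuning may drift from it.
+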
+
+ comment: Survey paper +
+
+
+
+
+ + ☆ Discrete Unit based Masking for Improving Disentanglement in Voice + Conversion + + +
+ Voice conversion (VC) aims to modify the speaker's identity while preserving
+the linguistic content. Commonly, VC methods use an encoder-decoder
+architecture, where disentangling the speaker's identity from linguistic
+information is crucial. However, the disentanglement approaches used in these
+methods are limited, as the speaker features depend on the phonetic content of
+the utterance, compromising disentanglement. This dependency is amplified with
+attention-based methods. To address this, we introduce a novel masking
+mechanism in the input before speaker encoding, masking certain discrete speech
+units that correspond highly with phoneme classes. Our work aims to reduce the
+phonetic dependency of speaker features by restricting access to some phonetic
+information. Furthermore, since our approach is at the input level, it is
+applicable to any encoder-decoder based VC framework. Our approach improves
+disentanglement and conversion performance across multiple VC methods, showing
+significant effectiveness, particularly in the attention-based method, with a
+44% relative improvement in objective intelligibility.
+
+
+ comment: Accepted to IEEE SLT 2024 +
+
+
+
+
+ + ☆ A Property Encoder for Graph Neural Networks + + +
+ Graph machine learning, particularly using graph neural networks,
+fundamentally relies on node features. Nevertheless, numerous real-world
+systems, such as social and biological networks, often lack node features due
+to various reasons, including privacy concerns, incomplete or missing data, and
+limitations in data collection. In such scenarios, researchers typically resort
+to methods like structural and positional encoding to construct node features.
+However, the length of such features is contingent on the maximum value within
+the property being encoded, for example, the highest node degree, which can be
+exceedingly large in applications like scale-free networks. Furthermore, these
+encoding schemes are limited to categorical data and might not be able to
+encode metrics returning other types of values. In this paper, we introduce a
+novel, universally applicable encoder, termed PropEnc, which constructs
+expressive node embeddings from any given graph metric. PropEnc leverages
+histogram construction combined with reverse index encoding, offering a
+flexible method for node feature initialization. It supports flexible encoding
+in terms of both dimensionality and type of input, demonstrating its
+effectiveness across diverse applications. PropEnc allows encoding metrics in
+a low-dimensional space, which effectively avoids the issue of sparsity and
+enhances the efficiency of the models. We show that \emph{PropEnc} can
+construct node features that either exactly replicate one-hot encoding or
+closely approximate indices under various settings. Our extensive evaluations
+in the graph classification setting across multiple social networks that lack
+node features support our hypothesis. The empirical results conclusively
+demonstrate that PropEnc is both an efficient and effective mechanism for
+constructing node features from a diverse set of graph metrics.
+
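+ The histogram-plus-reverse-index idea can be read as follows (an illustrative
+sketch of our understanding; the paper's exact construction may differ, and
+the function and parameter names here are ours):
+
+    // Encode a per-node metric (e.g. degree) as a fixed-width indicator over
+    // histogram bins, so the feature length depends on the chosen bin count,
+    // not on the metric's maximum value.
+    function propEncode(values, numBins) {
+      const lo = Math.min(...values);
+      const hi = Math.max(...values);
+      const width = (hi - lo) / numBins || 1; // guard against constant metrics
+      return values.map(v => {
+        const bin = Math.min(numBins - 1, Math.floor((v - lo) / width));
+        const feat = new Array(numBins).fill(0);
+        feat[bin] = 1; // "reverse index": the node points back into its bin
+        return feat;
+      });
+    }
+
+    // Example: degrees from a scale-free graph compress into 8 dimensions.
+    console.log(propEncode([1, 1, 2, 3, 5, 40, 1000], 8));
+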
+
+ comment: conference paper +
+
+
+
+
+ + ☆ VALO: A Versatile Anytime Framework for LiDAR-based Object Detection + Deep Neural Networks + + +
+ This work addresses the challenge of adapting to dynamic deadline requirements
+for LiDAR object detection deep neural networks (DNNs). The computing latency
+of object detection is critically important to ensure safe and efficient
+navigation. However, state-of-the-art LiDAR object detection DNNs often exhibit
+significant latency, hindering their real-time performance on
+resource-constrained edge platforms. Therefore, a tradeoff between detection
+accuracy and latency should be dynamically managed at runtime to achieve
+optimum results.
+ In this paper, we introduce VALO (Versatile Anytime algorithm for LiDAR
+Object detection), a novel data-centric approach that enables anytime computing
+of 3D LiDAR object detection DNNs. VALO employs a deadline-aware scheduler to
+selectively process input regions, making execution time and accuracy tradeoffs
+without architectural modifications. Additionally, it leverages efficient
+forecasting of past detection results to mitigate possible loss of accuracy due
+to partial processing of input. Finally, it utilizes a novel input reduction
+technique within its detection heads to significantly accelerate execution
+without sacrificing accuracy.
+ We implement VALO on state-of-the-art 3D LiDAR object detection networks,
+namely CenterPoint and VoxelNext, and demonstrate its dynamic adaptability to a
+wide range of time constraints while achieving higher accuracy than the prior
+state-of-the-art. Code is available at https://github.com/CSL-KU/VALO.
+
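+ The "anytime" contract can be pictured with a toy deadline-aware scheduler
+(purely illustrative; VALO's actual region selection, forecasting, and head
+reduction are more involved than this sketch):
+
+    // Pick input regions to process this frame, spending at most budgetMs and
+    // preferring the regions that have gone unprocessed the longest.
+    function selectRegions(regions, budgetMs) {
+      // regions: [{ id, estCostMs, lastProcessedFrame }, ...]
+      const stalestFirst = [...regions].sort(
+        (a, b) => a.lastProcessedFrame - b.lastProcessedFrame);
+      const chosen = [];
+      let spent = 0;
+      for (const r of stalestFirst) {
+        if (spent + r.estCostMs > budgetMs) break; // respect the deadline
+        chosen.push(r.id);
+        spent += r.estCostMs;
+      }
+      return chosen; // skipped regions fall back to forecasted detections
+    }
+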
+
+
+
+
+ + ☆ Balancing Optimality and Diversity: Human-Centered Decision Making + through Generative Curation + + +
+ The surge in data availability has inundated decision-makers with an
+overwhelming array of choices. While existing approaches focus on optimizing
+decisions based on quantifiable metrics, practical decision-making often
+requires balancing measurable quantitative criteria with unmeasurable
+qualitative factors embedded in the broader context. In such cases, algorithms
+can generate high-quality recommendations, but the final decision rests with
+the human, who must weigh both dimensions. We define the process of selecting
+the optimal set of algorithmic recommendations in this context as
+human-centered decision making. To address this challenge, we introduce a novel
+framework called generative curation, which optimizes the true desirability of
+decision options by integrating both quantitative and qualitative aspects. Our
+framework uses a Gaussian process to model unknown qualitative factors and
+derives a diversity metric that balances quantitative optimality with
+qualitative diversity. This trade-off enables the generation of a manageable
+subset of diverse, near-optimal actions that are robust to unknown qualitative
+preferences. To operationalize this framework, we propose two implementation
+approaches: a generative neural network architecture that produces a
+distribution $\pi$ to efficiently sample a diverse set of near-optimal actions,
+and a sequential optimization method that iteratively generates solutions that
+can be easily incorporated into complex optimization formulations. We validate
+our approach with extensive datasets, demonstrating its effectiveness in
+enhancing decision-making processes across a range of complex environments,
+with significant implications for policy and management.
+
+
+
+
+
+ + ♻ ☆ Designing Observables for Measurements with Deep Learning + + +
+ Many analyses in particle and nuclear physics use simulations to infer +fundamental, effective, or phenomenological parameters of the underlying +physics models. When the inference is performed with unfolded cross sections, +the observables are designed using physics intuition and heuristics. We propose +to design targeted observables with machine learning. Unfolded, differential +cross sections in a neural network output contain the most information about +parameters of interest and can be well-measured by construction. The networks +are trained using a custom loss function that rewards outputs that are +sensitive to the parameter(s) of interest while simultaneously penalizing +outputs that are different between particle-level and detector-level (to +minimize detector distortions). We demonstrate this idea in simulation using +two physics models for inclusive measurements in deep inelastic scattering. We +find that the new approach is more sensitive than classical observables at +distinguishing the two models and also has a reduced unfolding uncertainty due +to the reduced detector distortions. + +
+
+ comment: This is the version published in EPJC +
+
+
+
+
+ + ♻ ☆ MMLU-Pro+: Evaluating Higher-Order Reasoning and Shortcut Learning in + LLMs + + +
+ Existing benchmarks for large language models (LLMs) increasingly struggle to +differentiate between top-performing models, underscoring the need for more +challenging evaluation frameworks. We introduce MMLU-Pro+, an enhanced +benchmark building upon MMLU-Pro to assess shortcut learning and higher-order +reasoning in LLMs. By incorporating questions with multiple correct answers +across diverse domains, MMLU-Pro+ tests LLMs' ability to engage in complex +reasoning and resist simplistic problem-solving strategies. Our results show +that MMLU-Pro+ maintains MMLU-Pro's difficulty while providing a more rigorous +test of model discrimination, particularly in multi-correct answer scenarios. +We introduce novel metrics like shortcut selection ratio and correct pair +identification ratio, offering deeper insights into model behavior and +anchoring bias. Evaluations of six state-of-the-art LLMs reveal significant +performance gaps, highlighting variations in reasoning abilities and bias +susceptibility. We release the dataset and evaluation codes at +\url{https://github.com/asgsaeid/mmlu-pro-plus}. + +
+
+
+
+
+ + ♻ ☆ Initial Guessing Bias: How Untrained Networks Favor Some Classes + + +
+ Understanding and controlling biasing effects in neural networks is crucial +for ensuring accurate and fair model performance. In the context of +classification problems, we provide a theoretical analysis demonstrating that +the structure of a deep neural network (DNN) can condition the model to assign +all predictions to the same class, even before the beginning of training, and +in the absence of explicit biases. We prove that, besides dataset properties, +the presence of this phenomenon, which we call \textit{Initial Guessing Bias} +(IGB), is influenced by model choices including dataset preprocessing methods, +and architectural decisions, such as activation functions, max-pooling layers, +and network depth. Our analysis of IGB provides information for architecture +selection and model initialization. We also highlight theoretical consequences, +such as the breakdown of node-permutation symmetry, the violation of +self-averaging and the non-trivial effects that depth has on the phenomenon. + +
+
+ comment: Updated the notation to enhance clarity +
+
+
+
+
+ + ♻ ☆ MEMO-QCD: Quantum Density Estimation through Memetic Optimisation for + Quantum Circuit Design + + +
+ This paper presents a strategy for efficient quantum circuit design for +density estimation. The strategy is based on a quantum-inspired algorithm for +density estimation and a circuit optimisation routine based on memetic +algorithms. The model maps a training dataset to a quantum state represented by +a density matrix through a quantum feature map. This training state encodes the +probability distribution of the dataset in a quantum state, such that the +density of a new sample can be estimated by projecting its corresponding +quantum state onto the training state. We propose the application of a memetic +algorithm to find the architecture and parameters of a variational quantum +circuit that implements the quantum feature map, along with a variational +learning strategy to prepare the training state. Demonstrations of the proposed +strategy show an accurate approximation of the Gaussian kernel density +estimation method through shallow quantum circuits illustrating the feasibility +of the algorithm for near-term quantum hardware. + +
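+ In this family of quantum-inspired estimators, the density estimate is a
+projection of the test state onto the training state (our paraphrase of the
+abstract; the notation is ours and may differ from the paper's):
+$\hat{p}(x) \propto \langle \phi(x) \mid \rho_{\mathrm{train}} \mid \phi(x)
+\rangle$, where $\phi$ is the quantum feature map and $\rho_{\mathrm{train}}$
+is the density matrix encoding the training set. The memetic search then looks
+for a shallow variational circuit implementing $\phi$ together with the
+preparation of $\rho_{\mathrm{train}}$.
+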
+
+ comment: 15 pages, 8 figures, presented at QTML 2023 +
+
+
+
+
+ + ♻ ☆ Bagged Polynomial Regression and Neural Networks + + +
+ Series and polynomial regression are able to approximate the same function +classes as neural networks. However, these methods are rarely used in practice, +although they offer more interpretability than neural networks. In this paper, +we show that a potential reason for this is the slow convergence rate of +polynomial regression estimators and propose the use of \textit{bagged} +polynomial regression (BPR) as an attractive alternative to neural networks. +Theoretically, we derive new finite sample and asymptotic $L^2$ convergence +rates for series estimators. We show that the rates can be improved in smooth +settings by splitting the feature space and generating polynomial features +separately for each partition. Empirically, we show that our proposed +estimator, the BPR, can perform as well as more complex models with more +parameters. Our estimator also performs close to state-of-the-art prediction +methods in the benchmark MNIST handwritten digit dataset. We demonstrate that +BPR performs as well as neural networks in crop classification using satellite +data, a setting where prediction accuracy is critical and interpretability is +often required for addressing research questions. + +
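+ Concretely, bagging here takes the generic form (a standard definition,
+written in our notation): $\hat{f}_{\mathrm{BPR}}(x) = \frac{1}{B}
+\sum_{b=1}^{B} \hat{f}_b(x)$, where each $\hat{f}_b$ is a polynomial (series)
+regression estimator fit on the $b$-th bootstrap resample of the training
+data, and the smooth-setting refinement fits such polynomials separately on
+each cell of a partition of the feature space before averaging.
+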
+
+
+
+
+ + ♻ ☆ Hyper-STTN: Social Group-aware Spatial-Temporal Transformer Network for + Human Trajectory Prediction with Hypergraph Reasoning + + +
+ Predicting crowd intents and trajectories is crucial in various real-world
+applications, including service robots and autonomous vehicles. Understanding
+environmental dynamics is challenging, not only due to the complexities of
+modeling pair-wise spatial and temporal interactions but also the diverse
+influence of group-wise interactions. To decode the comprehensive pair-wise and
+group-wise interactions in crowded scenarios, we introduce Hyper-STTN, a
+Hypergraph-based Spatial-Temporal Transformer Network for crowd trajectory
+prediction. In Hyper-STTN, crowded group-wise correlations are constructed
+using a set of multi-scale hypergraphs with varying group sizes, captured
+through random-walk probability-based hypergraph spectral convolution.
+Additionally, a spatial-temporal transformer is adapted to capture pedestrians'
+pair-wise latent interactions in spatial-temporal dimensions. These
+heterogeneous group-wise and pair-wise interactions are then fused and aligned
+through a multimodal transformer network. Hyper-STTN outperforms other
+state-of-the-art baselines and ablation models on 5 real-world pedestrian
+motion datasets.
+
+
+
+
+
+ + ♻ ☆ Enhancing Worldwide Image Geolocation by Ensembling Satellite-Based + Ground-Level Attribute Predictors + + +
+ We examine the challenge of estimating the location of a single ground-level
+image in the absence of GPS or other location metadata. Currently, geolocation
+systems are evaluated by measuring the Great Circle Distance between the
+predicted location and ground truth. Because this measurement only uses a
+single point, it cannot assess the distribution of predictions by geolocation
+systems. Evaluation of a distribution of potential locations (areas) is
+required when there are follow-on procedures to further narrow down or verify
+the location. This is especially important in poorly-sampled regions, e.g.,
+rural and wilderness areas.
+ In this paper, we introduce a novel metric, Recall vs Area (RvA), which
+measures the accuracy of estimated distributions of locations. RvA treats image
+geolocation results similarly to document retrieval, measuring recall as a
+function of area: For a ranked list of (possibly discontiguous) predicted
+regions, we measure the area required for accumulated regions to contain the
+ground truth coordinate. This produces a curve similar to a precision-recall
+curve, where "precision" is replaced by square kilometers of area, enabling
+evaluation for different downstream search area budgets.
+ Following from this view of the problem, we then examine an ensembling
+approach to global-scale image geolocation, which incorporates information from
+multiple sources, and can readily incorporate multiple models, attribute
+predictors, and data sources. We study its effectiveness by combining the
+geolocation models GeoEstimation and the current state-of-the-art, GeoCLIP,
+with attribute predictors based on Oak Ridge National Laboratory LandScan and
+European Space Agency Climate Change Initiative Land Cover. We find significant
+improvements in image geolocation for areas that are under-represented in the
+training set, particularly non-urban areas, on both Im2GPS3k and Street View
+images.
+
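+ The RvA computation is simple enough to state in code; the sketch below is
+our reading of the metric from the abstract, with illustrative field names and
+caller-supplied area budgets:
+
+    // For each query image we are given a ranked list of predicted regions,
+    // each with an area and a flag saying whether it contains the truth.
+    // Recall at an area budget = fraction of queries whose true location is
+    // covered before the accumulated area exceeds the budget.
+    function recallVsArea(queries, budgetsKm2) {
+      return budgetsKm2.map(budget => {
+        let hits = 0;
+        for (const regions of queries) {
+          let area = 0;
+          for (const r of regions) {
+            area += r.areaKm2;
+            if (area > budget) break;             // search-area budget spent
+            if (r.containsTruth) { hits++; break; }
+          }
+        }
+        return { budget, recall: hits / queries.length };
+      });
+    }
+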
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ NVLM: Open Frontier-Class Multimodal LLMs + + +
+ We introduce NVLM 1.0, a family of frontier-class multimodal large language +models (LLMs) that achieve state-of-the-art results on vision-language tasks, +rivaling the leading proprietary models (e.g., GPT-4o) and open-access models +(e.g., Llama 3-V 405B and InternVL 2). Remarkably, NVLM 1.0 shows improved +text-only performance over its LLM backbone after multimodal training. In terms +of model design, we perform a comprehensive comparison between decoder-only +multimodal LLMs (e.g., LLaVA) and cross-attention-based models (e.g., +Flamingo). Based on the strengths and weaknesses of both approaches, we propose +a novel architecture that enhances both training efficiency and multimodal +reasoning capabilities. Furthermore, we introduce a 1-D tile-tagging design for +tile-based dynamic high-resolution images, which significantly boosts +performance on multimodal reasoning and OCR-related tasks. Regarding training +data, we meticulously curate and provide detailed information on our multimodal +pretraining and supervised fine-tuning datasets. Our findings indicate that +dataset quality and task diversity are more important than scale, even during +the pretraining phase, across all architectures. Notably, we develop +production-grade multimodality for the NVLM-1.0 models, enabling them to excel +in vision-language tasks while maintaining and even improving text-only +performance compared to their LLM backbones. To achieve this, we craft and +integrate a high-quality text-only dataset into multimodal training, alongside +a substantial amount of multimodal math and reasoning data, leading to enhanced +math and coding capabilities across modalities. To advance research in the +field, we are releasing the model weights and will open-source the code for the +community: https://nvlm-project.github.io/. + +
+
+
+
+
+ + ☆ Enhancing Few-Shot Classification without Forgetting through Multi-Level + Contrastive Constraints + + +
+ Most recent few-shot learning approaches are based on meta-learning with
+episodic training. However, prior studies encounter two crucial problems: (1)
+\textit{the presence of inductive bias}, and (2) \textit{the occurrence of
+catastrophic forgetting}. In this paper, we propose a novel Multi-Level
+Contrastive Constraints (MLCC) framework that jointly integrates
+within-episode learning and across-episode learning into a unified interactive
+learning paradigm to solve these issues. Specifically, we employ a space-aware
+interaction modeling scheme to explore the correct inductive paradigms for each
+class between within-episode similarity/dis-similarity distributions.
+Additionally, with the aim of better utilizing former prior knowledge, a
+cross-stage distribution adaptation strategy is designed to align the
+across-episode distributions from different time stages, thus reducing the
+semantic gap between existing and past prediction distributions. Extensive
+experiments on multiple few-shot datasets demonstrate the consistent
+superiority of the MLCC approach over existing state-of-the-art baselines.
+
+
+
+
+
+ + ☆ Less is More: A Simple yet Effective Token Reduction Method for + Efficient Multi-modal LLMs + + +
+ The rapid advancement of Multimodal Large Language Models (MLLMs) has led to
+remarkable performance across various domains. However, this progress is
+accompanied by a substantial surge in the resource consumption of these models.
+We address this pressing issue by introducing a new approach, Token Reduction
+using CLIP Metric (TRIM), aimed at improving the efficiency of MLLMs without
+sacrificing their performance. Inspired by human attention patterns in Visual
+Question Answering (VQA) tasks, TRIM presents a fresh perspective on the
+selection and reduction of image tokens. The TRIM method has been extensively
+tested across 12 datasets, and the results demonstrate a significant reduction
+in computational overhead while maintaining a consistent level of performance.
+This research marks a critical stride in efficient MLLM development, promoting
+greater accessibility and sustainability of high-performing models.
+
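+ The abstract suggests scoring image tokens against the text and keeping the
+most relevant ones; the sketch below is that reading in miniature, with a plain
+cosine similarity standing in for the CLIP metric (the paper's actual selection
+rule may differ, and all names here are illustrative):
+
+    function cosine(a, b) {
+      let dot = 0, na = 0, nb = 0;
+      for (let i = 0; i < a.length; i++) {
+        dot += a[i] * b[i]; na += a[i] * a[i]; nb += b[i] * b[i];
+      }
+      return dot / (Math.sqrt(na) * Math.sqrt(nb) || 1);
+    }
+
+    // Keep the `keep` image tokens most similar to the text feature, in their
+    // original order, and drop the rest before they reach the LLM.
+    function trimTokens(imageTokens, textFeature, keep) {
+      return imageTokens
+        .map((t, i) => ({ i, score: cosine(t, textFeature) }))
+        .sort((a, b) => b.score - a.score)
+        .slice(0, keep)
+        .sort((a, b) => a.i - b.i)
+        .map(s => imageTokens[s.i]);
+    }
+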
+
+ comment: 9 pages, 3 figures, 6 tables +
+
+
+
+
+ + ☆ Towards Effective User Attribution for Latent Diffusion Models via + Watermark-Informed Blending + + +
+ Rapid advancements in multimodal large language models have enabled the +creation of hyper-realistic images from textual descriptions. However, these +advancements also raise significant concerns about unauthorized use, which +hinders their broader distribution. Traditional watermarking methods often +require complex integration or degrade image quality. To address these +challenges, we introduce a novel framework Towards Effective user Attribution +for latent diffusion models via Watermark-Informed Blending (TEAWIB). TEAWIB +incorporates a unique ready-to-use configuration approach that allows seamless +integration of user-specific watermarks into generative models. This approach +ensures that each user can directly apply a pre-configured set of parameters to +the model without altering the original model parameters or compromising image +quality. Additionally, noise and augmentation operations are embedded at the +pixel level to further secure and stabilize watermarked images. Extensive +experiments validate the effectiveness of TEAWIB, showcasing the +state-of-the-art performance in perceptual quality and attribution accuracy. + +
+
+ comment: 9 pages, 7 figures +
+
+
+
+
+ + ☆ 3DFacePolicy: Speech-Driven 3D Facial Animation with Diffusion Policy + + +
+ Audio-driven 3D facial animation has made impressive progress in both research
+and application development. The newest approaches focus on Transformer-based
+and diffusion-based methods; however, there is still a gap in vividness and
+emotional expression between the generated animations and real human faces. To
+tackle this limitation, we propose 3DFacePolicy, a diffusion policy model for
+3D facial animation prediction. This method generates variable and realistic
+human facial movements by predicting the 3D vertex trajectory on the 3D facial
+template with a diffusion policy, instead of generating the face for every
+frame. It takes audio and vertex states as observations to predict the vertex
+trajectory and imitate real human facial expressions, which keeps the
+continuous and natural flow of human emotions. The experiments show that our
+approach is effective in variable and dynamic facial motion synthesis.
+
+
+
+
+
+ + ☆ PDMX: A Large-Scale Public Domain MusicXML Dataset for Symbolic Music + Processing + + +
+ The recent explosion of generative AI-Music systems has raised numerous
+concerns over data copyright, licensing music from musicians, and the conflict
+between open-source AI and large prestige companies. Such issues highlight the
+need for publicly available, copyright-free musical data, of which there is a
+large shortage, particularly for symbolic music data. To alleviate this issue,
+we present PDMX: a large-scale open-source dataset of over 250K public domain
+MusicXML scores collected from the score-sharing forum MuseScore, making it the
+largest available copyright-free symbolic music dataset to our knowledge. PDMX
+additionally includes a wealth of both tag and user interaction metadata,
+allowing us to efficiently analyze the dataset and filter for high-quality
+user-generated scores. Given the additional metadata afforded by our data
+collection process, we conduct multitrack music generation experiments
+evaluating how different representative subsets of PDMX lead to different
+behaviors in downstream models, and how user-rating statistics can be used as
+an effective measure of data quality. Examples can be found at
+https://pnlong.github.io/PDMX.demo/.
+
+
+
+
+
+
+
+




diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse all paper entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) { // 9 = Tab (keyCode is deprecated but widely supported)
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false; // suppress the default Tab focus change
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) { // restore the theme saved on a previous visit
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`